{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 311, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003215434083601286, "grad_norm": 7.461493968963623, "learning_rate": 0.0, "loss": 1.4509, "step": 1 }, { "epoch": 0.006430868167202572, "grad_norm": 7.841219425201416, "learning_rate": 3.125e-07, "loss": 1.4192, "step": 2 }, { "epoch": 0.00964630225080386, "grad_norm": 8.970290184020996, "learning_rate": 6.25e-07, "loss": 1.4011, "step": 3 }, { "epoch": 0.012861736334405145, "grad_norm": 6.137041091918945, "learning_rate": 9.375000000000001e-07, "loss": 1.3749, "step": 4 }, { "epoch": 0.01607717041800643, "grad_norm": 5.494370460510254, "learning_rate": 1.25e-06, "loss": 1.3428, "step": 5 }, { "epoch": 0.01929260450160772, "grad_norm": 4.324854373931885, "learning_rate": 1.5625e-06, "loss": 1.2579, "step": 6 }, { "epoch": 0.022508038585209004, "grad_norm": 5.789402008056641, "learning_rate": 1.8750000000000003e-06, "loss": 1.2162, "step": 7 }, { "epoch": 0.02572347266881029, "grad_norm": 4.819009304046631, "learning_rate": 2.1875000000000002e-06, "loss": 1.1246, "step": 8 }, { "epoch": 0.028938906752411574, "grad_norm": 4.173788070678711, "learning_rate": 2.5e-06, "loss": 1.0988, "step": 9 }, { "epoch": 0.03215434083601286, "grad_norm": 3.964716911315918, "learning_rate": 2.8125e-06, "loss": 1.0293, "step": 10 }, { "epoch": 0.03536977491961415, "grad_norm": 3.7624258995056152, "learning_rate": 3.125e-06, "loss": 1.0261, "step": 11 }, { "epoch": 0.03858520900321544, "grad_norm": 2.7567455768585205, "learning_rate": 3.4375e-06, "loss": 0.9731, "step": 12 }, { "epoch": 0.04180064308681672, "grad_norm": 11.352989196777344, "learning_rate": 3.7500000000000005e-06, "loss": 0.9391, "step": 13 }, { "epoch": 0.04501607717041801, "grad_norm": 2.96602201461792, "learning_rate": 4.0625000000000005e-06, "loss": 0.8457, "step": 14 }, { "epoch": 0.04823151125401929, "grad_norm": 3.602654218673706, "learning_rate": 4.3750000000000005e-06, "loss": 0.854, "step": 15 }, { "epoch": 0.05144694533762058, "grad_norm": 3.3031013011932373, "learning_rate": 4.6875000000000004e-06, "loss": 0.7838, "step": 16 }, { "epoch": 0.05466237942122187, "grad_norm": 3.8468689918518066, "learning_rate": 5e-06, "loss": 0.8028, "step": 17 }, { "epoch": 0.05787781350482315, "grad_norm": 5.588563919067383, "learning_rate": 5.3125e-06, "loss": 0.7586, "step": 18 }, { "epoch": 0.06109324758842444, "grad_norm": 3.002431631088257, "learning_rate": 5.625e-06, "loss": 0.7981, "step": 19 }, { "epoch": 0.06430868167202572, "grad_norm": 2.289362668991089, "learning_rate": 5.9375e-06, "loss": 0.6801, "step": 20 }, { "epoch": 0.06752411575562701, "grad_norm": 2.4948556423187256, "learning_rate": 6.25e-06, "loss": 0.7076, "step": 21 }, { "epoch": 0.0707395498392283, "grad_norm": 3.449002742767334, "learning_rate": 6.5625e-06, "loss": 0.6911, "step": 22 }, { "epoch": 0.07395498392282958, "grad_norm": 2.8027281761169434, "learning_rate": 6.875e-06, "loss": 0.6124, "step": 23 }, { "epoch": 0.07717041800643087, "grad_norm": 3.1212947368621826, "learning_rate": 7.1875e-06, "loss": 0.6253, "step": 24 }, { "epoch": 0.08038585209003216, "grad_norm": 2.3612632751464844, "learning_rate": 7.500000000000001e-06, "loss": 0.6117, "step": 25 }, { "epoch": 0.08360128617363344, "grad_norm": 3.0025129318237305, "learning_rate": 7.8125e-06, "loss": 0.5949, "step": 26 }, { "epoch": 0.08681672025723473, "grad_norm": 2.812004566192627, "learning_rate": 8.125000000000001e-06, "loss": 0.5983, "step": 27 }, { "epoch": 0.09003215434083602, "grad_norm": 3.4180963039398193, "learning_rate": 8.4375e-06, "loss": 0.5706, "step": 28 }, { "epoch": 0.0932475884244373, "grad_norm": 3.952913522720337, "learning_rate": 8.750000000000001e-06, "loss": 0.536, "step": 29 }, { "epoch": 0.09646302250803858, "grad_norm": 2.5324349403381348, "learning_rate": 9.0625e-06, "loss": 0.5384, "step": 30 }, { "epoch": 0.09967845659163987, "grad_norm": 2.9339852333068848, "learning_rate": 9.375000000000001e-06, "loss": 0.5491, "step": 31 }, { "epoch": 0.10289389067524116, "grad_norm": 2.1388156414031982, "learning_rate": 9.6875e-06, "loss": 0.524, "step": 32 }, { "epoch": 0.10610932475884244, "grad_norm": 2.724695920944214, "learning_rate": 1e-05, "loss": 0.5224, "step": 33 }, { "epoch": 0.10932475884244373, "grad_norm": 2.3847827911376953, "learning_rate": 9.999683023724021e-06, "loss": 0.5185, "step": 34 }, { "epoch": 0.11254019292604502, "grad_norm": 5.435914039611816, "learning_rate": 9.998732135085665e-06, "loss": 0.5219, "step": 35 }, { "epoch": 0.1157556270096463, "grad_norm": 2.3512284755706787, "learning_rate": 9.99714745464859e-06, "loss": 0.4951, "step": 36 }, { "epoch": 0.1189710610932476, "grad_norm": 2.4358675479888916, "learning_rate": 9.994929183335237e-06, "loss": 0.5087, "step": 37 }, { "epoch": 0.12218649517684887, "grad_norm": 4.703172206878662, "learning_rate": 9.992077602401358e-06, "loss": 0.4586, "step": 38 }, { "epoch": 0.12540192926045016, "grad_norm": 1.9784342050552368, "learning_rate": 9.988593073400354e-06, "loss": 0.4962, "step": 39 }, { "epoch": 0.12861736334405144, "grad_norm": 2.9739573001861572, "learning_rate": 9.984476038137437e-06, "loss": 0.5232, "step": 40 }, { "epoch": 0.13183279742765272, "grad_norm": 4.0715413093566895, "learning_rate": 9.979727018613607e-06, "loss": 0.4603, "step": 41 }, { "epoch": 0.13504823151125403, "grad_norm": 3.099459409713745, "learning_rate": 9.974346616959476e-06, "loss": 0.456, "step": 42 }, { "epoch": 0.1382636655948553, "grad_norm": 3.039191246032715, "learning_rate": 9.968335515358916e-06, "loss": 0.45, "step": 43 }, { "epoch": 0.1414790996784566, "grad_norm": 22.894182205200195, "learning_rate": 9.961694475962562e-06, "loss": 0.4605, "step": 44 }, { "epoch": 0.14469453376205788, "grad_norm": 2.549868106842041, "learning_rate": 9.954424340791195e-06, "loss": 0.4636, "step": 45 }, { "epoch": 0.14790996784565916, "grad_norm": 2.1484735012054443, "learning_rate": 9.94652603162896e-06, "loss": 0.4533, "step": 46 }, { "epoch": 0.15112540192926044, "grad_norm": 2.6148245334625244, "learning_rate": 9.938000549906509e-06, "loss": 0.442, "step": 47 }, { "epoch": 0.15434083601286175, "grad_norm": 3.580359697341919, "learning_rate": 9.92884897657402e-06, "loss": 0.477, "step": 48 }, { "epoch": 0.15755627009646303, "grad_norm": 2.5778746604919434, "learning_rate": 9.919072471964146e-06, "loss": 0.4266, "step": 49 }, { "epoch": 0.1607717041800643, "grad_norm": 9.264074325561523, "learning_rate": 9.908672275644898e-06, "loss": 0.4375, "step": 50 }, { "epoch": 0.1639871382636656, "grad_norm": 3.2539267539978027, "learning_rate": 9.897649706262474e-06, "loss": 0.4438, "step": 51 }, { "epoch": 0.16720257234726688, "grad_norm": 3.373600721359253, "learning_rate": 9.88600616137407e-06, "loss": 0.4388, "step": 52 }, { "epoch": 0.17041800643086816, "grad_norm": 5.1715898513793945, "learning_rate": 9.873743117270691e-06, "loss": 0.4612, "step": 53 }, { "epoch": 0.17363344051446947, "grad_norm": 4.894754409790039, "learning_rate": 9.860862128789954e-06, "loss": 0.4513, "step": 54 }, { "epoch": 0.17684887459807075, "grad_norm": 3.1228713989257812, "learning_rate": 9.847364829118963e-06, "loss": 0.4439, "step": 55 }, { "epoch": 0.18006430868167203, "grad_norm": 5.72307014465332, "learning_rate": 9.833252929587231e-06, "loss": 0.4484, "step": 56 }, { "epoch": 0.1832797427652733, "grad_norm": 7.6115336418151855, "learning_rate": 9.818528219449705e-06, "loss": 0.4642, "step": 57 }, { "epoch": 0.1864951768488746, "grad_norm": 4.580008506774902, "learning_rate": 9.803192565659898e-06, "loss": 0.4289, "step": 58 }, { "epoch": 0.18971061093247588, "grad_norm": 4.601083278656006, "learning_rate": 9.78724791263318e-06, "loss": 0.416, "step": 59 }, { "epoch": 0.19292604501607716, "grad_norm": 5.066440105438232, "learning_rate": 9.770696282000245e-06, "loss": 0.4083, "step": 60 }, { "epoch": 0.19614147909967847, "grad_norm": 4.051520824432373, "learning_rate": 9.753539772350792e-06, "loss": 0.4177, "step": 61 }, { "epoch": 0.19935691318327975, "grad_norm": 3.406569242477417, "learning_rate": 9.735780558967434e-06, "loss": 0.4328, "step": 62 }, { "epoch": 0.20257234726688103, "grad_norm": 4.951582908630371, "learning_rate": 9.717420893549902e-06, "loss": 0.424, "step": 63 }, { "epoch": 0.2057877813504823, "grad_norm": 2.5664072036743164, "learning_rate": 9.698463103929542e-06, "loss": 0.4254, "step": 64 }, { "epoch": 0.2090032154340836, "grad_norm": 2.8900935649871826, "learning_rate": 9.67890959377418e-06, "loss": 0.4202, "step": 65 }, { "epoch": 0.21221864951768488, "grad_norm": 3.5653440952301025, "learning_rate": 9.658762842283343e-06, "loss": 0.397, "step": 66 }, { "epoch": 0.21543408360128619, "grad_norm": 2.600797414779663, "learning_rate": 9.638025403873939e-06, "loss": 0.3912, "step": 67 }, { "epoch": 0.21864951768488747, "grad_norm": 7.3537397384643555, "learning_rate": 9.616699907856368e-06, "loss": 0.3758, "step": 68 }, { "epoch": 0.22186495176848875, "grad_norm": 5.298691272735596, "learning_rate": 9.594789058101154e-06, "loss": 0.4368, "step": 69 }, { "epoch": 0.22508038585209003, "grad_norm": 2.9018328189849854, "learning_rate": 9.57229563269612e-06, "loss": 0.4067, "step": 70 }, { "epoch": 0.2282958199356913, "grad_norm": 2.2843472957611084, "learning_rate": 9.549222483594154e-06, "loss": 0.3884, "step": 71 }, { "epoch": 0.2315112540192926, "grad_norm": 4.242974281311035, "learning_rate": 9.525572536251608e-06, "loss": 0.3895, "step": 72 }, { "epoch": 0.2347266881028939, "grad_norm": 3.73633074760437, "learning_rate": 9.501348789257373e-06, "loss": 0.408, "step": 73 }, { "epoch": 0.2379421221864952, "grad_norm": 7.407820224761963, "learning_rate": 9.476554313952697e-06, "loss": 0.3862, "step": 74 }, { "epoch": 0.24115755627009647, "grad_norm": 4.708957672119141, "learning_rate": 9.451192254041759e-06, "loss": 0.4149, "step": 75 }, { "epoch": 0.24437299035369775, "grad_norm": 6.709017276763916, "learning_rate": 9.425265825193077e-06, "loss": 0.38, "step": 76 }, { "epoch": 0.24758842443729903, "grad_norm": 2.1756374835968018, "learning_rate": 9.398778314631801e-06, "loss": 0.3799, "step": 77 }, { "epoch": 0.2508038585209003, "grad_norm": 3.1432833671569824, "learning_rate": 9.371733080722911e-06, "loss": 0.3882, "step": 78 }, { "epoch": 0.2540192926045016, "grad_norm": 2.0718796253204346, "learning_rate": 9.34413355254542e-06, "loss": 0.4123, "step": 79 }, { "epoch": 0.2572347266881029, "grad_norm": 3.231426954269409, "learning_rate": 9.31598322945759e-06, "loss": 0.3627, "step": 80 }, { "epoch": 0.2604501607717042, "grad_norm": 3.3169357776641846, "learning_rate": 9.287285680653254e-06, "loss": 0.3747, "step": 81 }, { "epoch": 0.26366559485530544, "grad_norm": 1.9841314554214478, "learning_rate": 9.258044544709276e-06, "loss": 0.399, "step": 82 }, { "epoch": 0.26688102893890675, "grad_norm": 2.6931841373443604, "learning_rate": 9.228263529124199e-06, "loss": 0.3995, "step": 83 }, { "epoch": 0.27009646302250806, "grad_norm": 2.48873233795166, "learning_rate": 9.197946409848196e-06, "loss": 0.4221, "step": 84 }, { "epoch": 0.2733118971061093, "grad_norm": 20.441673278808594, "learning_rate": 9.167097030804289e-06, "loss": 0.3649, "step": 85 }, { "epoch": 0.2765273311897106, "grad_norm": 2.6681602001190186, "learning_rate": 9.135719303400995e-06, "loss": 0.3638, "step": 86 }, { "epoch": 0.2797427652733119, "grad_norm": 4.435401439666748, "learning_rate": 9.103817206036383e-06, "loss": 0.3722, "step": 87 }, { "epoch": 0.2829581993569132, "grad_norm": 5.914163589477539, "learning_rate": 9.071394783593664e-06, "loss": 0.3656, "step": 88 }, { "epoch": 0.2861736334405145, "grad_norm": 6.216729640960693, "learning_rate": 9.038456146928325e-06, "loss": 0.3916, "step": 89 }, { "epoch": 0.28938906752411575, "grad_norm": 2.873570442199707, "learning_rate": 9.005005472346923e-06, "loss": 0.3903, "step": 90 }, { "epoch": 0.29260450160771706, "grad_norm": 4.470005035400391, "learning_rate": 8.971047001077561e-06, "loss": 0.3987, "step": 91 }, { "epoch": 0.2958199356913183, "grad_norm": 2.5284571647644043, "learning_rate": 8.936585038732143e-06, "loss": 0.4044, "step": 92 }, { "epoch": 0.2990353697749196, "grad_norm": 2.339695692062378, "learning_rate": 8.90162395476046e-06, "loss": 0.3858, "step": 93 }, { "epoch": 0.3022508038585209, "grad_norm": 2.0064709186553955, "learning_rate": 8.866168181896198e-06, "loss": 0.4002, "step": 94 }, { "epoch": 0.3054662379421222, "grad_norm": 3.07234525680542, "learning_rate": 8.83022221559489e-06, "loss": 0.375, "step": 95 }, { "epoch": 0.3086816720257235, "grad_norm": 2.4521424770355225, "learning_rate": 8.793790613463956e-06, "loss": 0.3549, "step": 96 }, { "epoch": 0.31189710610932475, "grad_norm": 2.3006341457366943, "learning_rate": 8.756877994684818e-06, "loss": 0.3798, "step": 97 }, { "epoch": 0.31511254019292606, "grad_norm": 3.3463075160980225, "learning_rate": 8.719489039427256e-06, "loss": 0.3871, "step": 98 }, { "epoch": 0.3183279742765273, "grad_norm": 2.5349507331848145, "learning_rate": 8.681628488255986e-06, "loss": 0.4025, "step": 99 }, { "epoch": 0.3215434083601286, "grad_norm": 2.8825855255126953, "learning_rate": 8.643301141529619e-06, "loss": 0.3998, "step": 100 }, { "epoch": 0.3247588424437299, "grad_norm": 4.388237953186035, "learning_rate": 8.604511858792006e-06, "loss": 0.3714, "step": 101 }, { "epoch": 0.3279742765273312, "grad_norm": 2.6666557788848877, "learning_rate": 8.565265558156101e-06, "loss": 0.3509, "step": 102 }, { "epoch": 0.3311897106109325, "grad_norm": 2.7230324745178223, "learning_rate": 8.525567215680397e-06, "loss": 0.366, "step": 103 }, { "epoch": 0.33440514469453375, "grad_norm": 2.4554688930511475, "learning_rate": 8.485421864737997e-06, "loss": 0.3919, "step": 104 }, { "epoch": 0.33762057877813506, "grad_norm": 7.866596221923828, "learning_rate": 8.444834595378434e-06, "loss": 0.3623, "step": 105 }, { "epoch": 0.3408360128617363, "grad_norm": 2.528653144836426, "learning_rate": 8.403810553682307e-06, "loss": 0.3758, "step": 106 }, { "epoch": 0.3440514469453376, "grad_norm": 2.836378335952759, "learning_rate": 8.362354941108803e-06, "loss": 0.3456, "step": 107 }, { "epoch": 0.34726688102893893, "grad_norm": 1.8620100021362305, "learning_rate": 8.320473013836197e-06, "loss": 0.3754, "step": 108 }, { "epoch": 0.3504823151125402, "grad_norm": 2.056680679321289, "learning_rate": 8.278170082095422e-06, "loss": 0.3858, "step": 109 }, { "epoch": 0.3536977491961415, "grad_norm": 1.9714686870574951, "learning_rate": 8.23545150949679e-06, "loss": 0.3941, "step": 110 }, { "epoch": 0.35691318327974275, "grad_norm": 2.2530500888824463, "learning_rate": 8.192322712349917e-06, "loss": 0.3712, "step": 111 }, { "epoch": 0.36012861736334406, "grad_norm": 1.7236007452011108, "learning_rate": 8.148789158977012e-06, "loss": 0.3532, "step": 112 }, { "epoch": 0.3633440514469453, "grad_norm": 1.8990964889526367, "learning_rate": 8.104856369019525e-06, "loss": 0.3801, "step": 113 }, { "epoch": 0.3665594855305466, "grad_norm": 5.287169933319092, "learning_rate": 8.060529912738316e-06, "loss": 0.3594, "step": 114 }, { "epoch": 0.36977491961414793, "grad_norm": 2.917484998703003, "learning_rate": 8.0158154103074e-06, "loss": 0.3696, "step": 115 }, { "epoch": 0.3729903536977492, "grad_norm": 2.5253026485443115, "learning_rate": 7.970718531101365e-06, "loss": 0.3553, "step": 116 }, { "epoch": 0.3762057877813505, "grad_norm": 2.7132797241210938, "learning_rate": 7.925244992976538e-06, "loss": 0.3775, "step": 117 }, { "epoch": 0.37942122186495175, "grad_norm": 5.237837791442871, "learning_rate": 7.879400561546033e-06, "loss": 0.3591, "step": 118 }, { "epoch": 0.38263665594855306, "grad_norm": 2.0805959701538086, "learning_rate": 7.833191049448706e-06, "loss": 0.3723, "step": 119 }, { "epoch": 0.3858520900321543, "grad_norm": 1.8187751770019531, "learning_rate": 7.786622315612182e-06, "loss": 0.3566, "step": 120 }, { "epoch": 0.3890675241157556, "grad_norm": 2.222515821456909, "learning_rate": 7.739700264509993e-06, "loss": 0.3809, "step": 121 }, { "epoch": 0.39228295819935693, "grad_norm": 8.328165054321289, "learning_rate": 7.692430845412946e-06, "loss": 0.3707, "step": 122 }, { "epoch": 0.3954983922829582, "grad_norm": 2.218949317932129, "learning_rate": 7.644820051634813e-06, "loss": 0.3642, "step": 123 }, { "epoch": 0.3987138263665595, "grad_norm": 1.9735389947891235, "learning_rate": 7.596873919772438e-06, "loss": 0.3605, "step": 124 }, { "epoch": 0.40192926045016075, "grad_norm": 3.412888526916504, "learning_rate": 7.548598528940354e-06, "loss": 0.3648, "step": 125 }, { "epoch": 0.40514469453376206, "grad_norm": 4.399238109588623, "learning_rate": 7.500000000000001e-06, "loss": 0.3735, "step": 126 }, { "epoch": 0.40836012861736337, "grad_norm": 1.8429063558578491, "learning_rate": 7.451084494783668e-06, "loss": 0.3775, "step": 127 }, { "epoch": 0.4115755627009646, "grad_norm": 2.099372386932373, "learning_rate": 7.401858215313228e-06, "loss": 0.3646, "step": 128 }, { "epoch": 0.41479099678456594, "grad_norm": 2.8833494186401367, "learning_rate": 7.352327403013779e-06, "loss": 0.3752, "step": 129 }, { "epoch": 0.4180064308681672, "grad_norm": 2.006443500518799, "learning_rate": 7.302498337922293e-06, "loss": 0.3567, "step": 130 }, { "epoch": 0.4212218649517685, "grad_norm": 2.024747371673584, "learning_rate": 7.2523773378913655e-06, "loss": 0.3623, "step": 131 }, { "epoch": 0.42443729903536975, "grad_norm": 1.9539835453033447, "learning_rate": 7.201970757788172e-06, "loss": 0.3709, "step": 132 }, { "epoch": 0.42765273311897106, "grad_norm": 1.9126152992248535, "learning_rate": 7.151284988688731e-06, "loss": 0.3518, "step": 133 }, { "epoch": 0.43086816720257237, "grad_norm": 1.9806180000305176, "learning_rate": 7.100326457067576e-06, "loss": 0.3623, "step": 134 }, { "epoch": 0.4340836012861736, "grad_norm": 4.260410785675049, "learning_rate": 7.049101623982938e-06, "loss": 0.3518, "step": 135 }, { "epoch": 0.43729903536977494, "grad_norm": 2.0884203910827637, "learning_rate": 6.9976169842575526e-06, "loss": 0.3812, "step": 136 }, { "epoch": 0.4405144694533762, "grad_norm": 4.204238414764404, "learning_rate": 6.945879065655164e-06, "loss": 0.3615, "step": 137 }, { "epoch": 0.4437299035369775, "grad_norm": 1.9794977903366089, "learning_rate": 6.893894428052881e-06, "loss": 0.3898, "step": 138 }, { "epoch": 0.44694533762057875, "grad_norm": 2.9515440464019775, "learning_rate": 6.841669662609437e-06, "loss": 0.3437, "step": 139 }, { "epoch": 0.45016077170418006, "grad_norm": 2.980576992034912, "learning_rate": 6.789211390929497e-06, "loss": 0.3523, "step": 140 }, { "epoch": 0.4533762057877814, "grad_norm": 4.675036907196045, "learning_rate": 6.736526264224101e-06, "loss": 0.3738, "step": 141 }, { "epoch": 0.4565916398713826, "grad_norm": 3.4226956367492676, "learning_rate": 6.6836209624673575e-06, "loss": 0.3726, "step": 142 }, { "epoch": 0.45980707395498394, "grad_norm": 2.1817691326141357, "learning_rate": 6.6305021935494755e-06, "loss": 0.3322, "step": 143 }, { "epoch": 0.4630225080385852, "grad_norm": 2.0901007652282715, "learning_rate": 6.5771766924262795e-06, "loss": 0.3328, "step": 144 }, { "epoch": 0.4662379421221865, "grad_norm": 1.8397691249847412, "learning_rate": 6.523651220265269e-06, "loss": 0.3492, "step": 145 }, { "epoch": 0.4694533762057878, "grad_norm": 2.156468391418457, "learning_rate": 6.469932563588386e-06, "loss": 0.3362, "step": 146 }, { "epoch": 0.47266881028938906, "grad_norm": 2.963684320449829, "learning_rate": 6.41602753341152e-06, "loss": 0.3438, "step": 147 }, { "epoch": 0.4758842443729904, "grad_norm": 1.6273006200790405, "learning_rate": 6.361942964380967e-06, "loss": 0.3434, "step": 148 }, { "epoch": 0.4790996784565916, "grad_norm": 2.0362226963043213, "learning_rate": 6.307685713906835e-06, "loss": 0.3487, "step": 149 }, { "epoch": 0.48231511254019294, "grad_norm": 1.889363169670105, "learning_rate": 6.2532626612936035e-06, "loss": 0.3335, "step": 150 }, { "epoch": 0.4855305466237942, "grad_norm": 5.194770336151123, "learning_rate": 6.1986807068678926e-06, "loss": 0.3578, "step": 151 }, { "epoch": 0.4887459807073955, "grad_norm": 2.6607186794281006, "learning_rate": 6.143946771103561e-06, "loss": 0.3585, "step": 152 }, { "epoch": 0.4919614147909968, "grad_norm": 1.6458920240402222, "learning_rate": 6.089067793744258e-06, "loss": 0.3163, "step": 153 }, { "epoch": 0.49517684887459806, "grad_norm": 1.745608925819397, "learning_rate": 6.034050732923538e-06, "loss": 0.3513, "step": 154 }, { "epoch": 0.4983922829581994, "grad_norm": 2.75510835647583, "learning_rate": 5.978902564282616e-06, "loss": 0.3436, "step": 155 }, { "epoch": 0.5016077170418006, "grad_norm": 3.112760543823242, "learning_rate": 5.923630280085948e-06, "loss": 0.3321, "step": 156 }, { "epoch": 0.5048231511254019, "grad_norm": 1.7312897443771362, "learning_rate": 5.8682408883346535e-06, "loss": 0.3464, "step": 157 }, { "epoch": 0.5080385852090032, "grad_norm": 3.926618814468384, "learning_rate": 5.8127414118779825e-06, "loss": 0.366, "step": 158 }, { "epoch": 0.5112540192926045, "grad_norm": 2.1648647785186768, "learning_rate": 5.757138887522884e-06, "loss": 0.3592, "step": 159 }, { "epoch": 0.5144694533762058, "grad_norm": 3.066451072692871, "learning_rate": 5.701440365141799e-06, "loss": 0.3374, "step": 160 }, { "epoch": 0.5176848874598071, "grad_norm": 1.5853915214538574, "learning_rate": 5.645652906778808e-06, "loss": 0.3354, "step": 161 }, { "epoch": 0.5209003215434084, "grad_norm": 1.4135924577713013, "learning_rate": 5.5897835857542315e-06, "loss": 0.3402, "step": 162 }, { "epoch": 0.5241157556270096, "grad_norm": 2.379409074783325, "learning_rate": 5.533839485767795e-06, "loss": 0.349, "step": 163 }, { "epoch": 0.5273311897106109, "grad_norm": 1.670160174369812, "learning_rate": 5.477827700000492e-06, "loss": 0.3314, "step": 164 }, { "epoch": 0.5305466237942122, "grad_norm": 2.1935579776763916, "learning_rate": 5.421755330215223e-06, "loss": 0.3147, "step": 165 }, { "epoch": 0.5337620578778135, "grad_norm": 3.3353383541107178, "learning_rate": 5.365629485856381e-06, "loss": 0.3427, "step": 166 }, { "epoch": 0.5369774919614148, "grad_norm": 1.6737383604049683, "learning_rate": 5.30945728314841e-06, "loss": 0.3091, "step": 167 }, { "epoch": 0.5401929260450161, "grad_norm": 1.815943717956543, "learning_rate": 5.253245844193564e-06, "loss": 0.3197, "step": 168 }, { "epoch": 0.5434083601286174, "grad_norm": 2.064694404602051, "learning_rate": 5.197002296068878e-06, "loss": 0.3491, "step": 169 }, { "epoch": 0.5466237942122186, "grad_norm": 2.5412817001342773, "learning_rate": 5.140733769922525e-06, "loss": 0.3323, "step": 170 }, { "epoch": 0.5498392282958199, "grad_norm": 1.7079787254333496, "learning_rate": 5.084447400069656e-06, "loss": 0.3382, "step": 171 }, { "epoch": 0.5530546623794212, "grad_norm": 1.8138501644134521, "learning_rate": 5.0281503230878304e-06, "loss": 0.3424, "step": 172 }, { "epoch": 0.5562700964630225, "grad_norm": 3.9634087085723877, "learning_rate": 4.971849676912172e-06, "loss": 0.3357, "step": 173 }, { "epoch": 0.5594855305466238, "grad_norm": 2.114734172821045, "learning_rate": 4.915552599930345e-06, "loss": 0.3544, "step": 174 }, { "epoch": 0.5627009646302251, "grad_norm": 1.3108409643173218, "learning_rate": 4.859266230077474e-06, "loss": 0.3134, "step": 175 }, { "epoch": 0.5659163987138264, "grad_norm": 1.882356882095337, "learning_rate": 4.802997703931124e-06, "loss": 0.3472, "step": 176 }, { "epoch": 0.5691318327974276, "grad_norm": 3.0320799350738525, "learning_rate": 4.746754155806437e-06, "loss": 0.3484, "step": 177 }, { "epoch": 0.572347266881029, "grad_norm": 1.5372339487075806, "learning_rate": 4.6905427168515914e-06, "loss": 0.3511, "step": 178 }, { "epoch": 0.5755627009646302, "grad_norm": 2.2504475116729736, "learning_rate": 4.63437051414362e-06, "loss": 0.3823, "step": 179 }, { "epoch": 0.5787781350482315, "grad_norm": 2.124473810195923, "learning_rate": 4.5782446697847775e-06, "loss": 0.3537, "step": 180 }, { "epoch": 0.5819935691318328, "grad_norm": 1.7836929559707642, "learning_rate": 4.52217229999951e-06, "loss": 0.3281, "step": 181 }, { "epoch": 0.5852090032154341, "grad_norm": 1.819919228553772, "learning_rate": 4.466160514232206e-06, "loss": 0.3307, "step": 182 }, { "epoch": 0.5884244372990354, "grad_norm": 2.2056925296783447, "learning_rate": 4.410216414245771e-06, "loss": 0.3289, "step": 183 }, { "epoch": 0.5916398713826366, "grad_norm": 1.8819239139556885, "learning_rate": 4.354347093221194e-06, "loss": 0.3139, "step": 184 }, { "epoch": 0.594855305466238, "grad_norm": 1.8985276222229004, "learning_rate": 4.298559634858202e-06, "loss": 0.3249, "step": 185 }, { "epoch": 0.5980707395498392, "grad_norm": 3.259624481201172, "learning_rate": 4.2428611124771184e-06, "loss": 0.3566, "step": 186 }, { "epoch": 0.6012861736334405, "grad_norm": 2.25924015045166, "learning_rate": 4.187258588122019e-06, "loss": 0.3359, "step": 187 }, { "epoch": 0.6045016077170418, "grad_norm": 1.6317423582077026, "learning_rate": 4.131759111665349e-06, "loss": 0.3231, "step": 188 }, { "epoch": 0.6077170418006431, "grad_norm": 1.4268584251403809, "learning_rate": 4.076369719914055e-06, "loss": 0.3621, "step": 189 }, { "epoch": 0.6109324758842444, "grad_norm": 1.7488436698913574, "learning_rate": 4.021097435717386e-06, "loss": 0.3263, "step": 190 }, { "epoch": 0.6141479099678456, "grad_norm": 2.6326963901519775, "learning_rate": 3.965949267076465e-06, "loss": 0.3376, "step": 191 }, { "epoch": 0.617363344051447, "grad_norm": 1.5136549472808838, "learning_rate": 3.910932206255742e-06, "loss": 0.3161, "step": 192 }, { "epoch": 0.6205787781350482, "grad_norm": 1.4793072938919067, "learning_rate": 3.856053228896442e-06, "loss": 0.3241, "step": 193 }, { "epoch": 0.6237942122186495, "grad_norm": 2.289064884185791, "learning_rate": 3.8013192931321095e-06, "loss": 0.3207, "step": 194 }, { "epoch": 0.6270096463022508, "grad_norm": 1.8162267208099365, "learning_rate": 3.7467373387063973e-06, "loss": 0.3242, "step": 195 }, { "epoch": 0.6302250803858521, "grad_norm": 1.8329249620437622, "learning_rate": 3.692314286093167e-06, "loss": 0.3248, "step": 196 }, { "epoch": 0.6334405144694534, "grad_norm": 1.6766780614852905, "learning_rate": 3.6380570356190346e-06, "loss": 0.3291, "step": 197 }, { "epoch": 0.6366559485530546, "grad_norm": 66.03868865966797, "learning_rate": 3.58397246658848e-06, "loss": 0.3078, "step": 198 }, { "epoch": 0.639871382636656, "grad_norm": 2.198519706726074, "learning_rate": 3.5300674364116173e-06, "loss": 0.3197, "step": 199 }, { "epoch": 0.6430868167202572, "grad_norm": 2.2276482582092285, "learning_rate": 3.476348779734732e-06, "loss": 0.3141, "step": 200 }, { "epoch": 0.6463022508038585, "grad_norm": 1.4550248384475708, "learning_rate": 3.4228233075737225e-06, "loss": 0.3327, "step": 201 }, { "epoch": 0.6495176848874598, "grad_norm": 1.832467794418335, "learning_rate": 3.3694978064505258e-06, "loss": 0.3196, "step": 202 }, { "epoch": 0.6527331189710611, "grad_norm": 1.8890045881271362, "learning_rate": 3.316379037532644e-06, "loss": 0.355, "step": 203 }, { "epoch": 0.6559485530546624, "grad_norm": 2.667874813079834, "learning_rate": 3.2634737357758994e-06, "loss": 0.3358, "step": 204 }, { "epoch": 0.6591639871382636, "grad_norm": 1.426277756690979, "learning_rate": 3.2107886090705035e-06, "loss": 0.3134, "step": 205 }, { "epoch": 0.662379421221865, "grad_norm": 2.3840363025665283, "learning_rate": 3.158330337390565e-06, "loss": 0.3144, "step": 206 }, { "epoch": 0.6655948553054662, "grad_norm": 1.9086871147155762, "learning_rate": 3.10610557194712e-06, "loss": 0.3043, "step": 207 }, { "epoch": 0.6688102893890675, "grad_norm": 1.7807660102844238, "learning_rate": 3.0541209343448373e-06, "loss": 0.3121, "step": 208 }, { "epoch": 0.6720257234726688, "grad_norm": 1.6139692068099976, "learning_rate": 3.0023830157424504e-06, "loss": 0.3454, "step": 209 }, { "epoch": 0.6752411575562701, "grad_norm": 1.9122083187103271, "learning_rate": 2.950898376017064e-06, "loss": 0.3175, "step": 210 }, { "epoch": 0.6784565916398714, "grad_norm": 2.084561586380005, "learning_rate": 2.8996735429324256e-06, "loss": 0.319, "step": 211 }, { "epoch": 0.6816720257234726, "grad_norm": 1.411837100982666, "learning_rate": 2.848715011311271e-06, "loss": 0.3085, "step": 212 }, { "epoch": 0.684887459807074, "grad_norm": 1.4443124532699585, "learning_rate": 2.7980292422118282e-06, "loss": 0.3366, "step": 213 }, { "epoch": 0.6881028938906752, "grad_norm": 1.447710394859314, "learning_rate": 2.7476226621086354e-06, "loss": 0.3171, "step": 214 }, { "epoch": 0.6913183279742765, "grad_norm": 1.3682136535644531, "learning_rate": 2.697501662077707e-06, "loss": 0.3158, "step": 215 }, { "epoch": 0.6945337620578779, "grad_norm": 8.954407691955566, "learning_rate": 2.6476725969862227e-06, "loss": 0.3474, "step": 216 }, { "epoch": 0.6977491961414791, "grad_norm": 1.565764307975769, "learning_rate": 2.5981417846867753e-06, "loss": 0.3287, "step": 217 }, { "epoch": 0.7009646302250804, "grad_norm": 1.5535356998443604, "learning_rate": 2.548915505216333e-06, "loss": 0.3206, "step": 218 }, { "epoch": 0.7041800643086816, "grad_norm": 1.5272207260131836, "learning_rate": 2.5000000000000015e-06, "loss": 0.3228, "step": 219 }, { "epoch": 0.707395498392283, "grad_norm": 1.4562255144119263, "learning_rate": 2.4514014710596467e-06, "loss": 0.296, "step": 220 }, { "epoch": 0.7106109324758842, "grad_norm": 1.6484540700912476, "learning_rate": 2.4031260802275623e-06, "loss": 0.3358, "step": 221 }, { "epoch": 0.7138263665594855, "grad_norm": 1.4094409942626953, "learning_rate": 2.3551799483651894e-06, "loss": 0.3332, "step": 222 }, { "epoch": 0.7170418006430869, "grad_norm": 1.8686710596084595, "learning_rate": 2.307569154587056e-06, "loss": 0.3356, "step": 223 }, { "epoch": 0.7202572347266881, "grad_norm": 1.5808024406433105, "learning_rate": 2.2602997354900075e-06, "loss": 0.315, "step": 224 }, { "epoch": 0.7234726688102894, "grad_norm": 1.5298748016357422, "learning_rate": 2.2133776843878185e-06, "loss": 0.3355, "step": 225 }, { "epoch": 0.7266881028938906, "grad_norm": 12.085217475891113, "learning_rate": 2.166808950551296e-06, "loss": 0.3301, "step": 226 }, { "epoch": 0.729903536977492, "grad_norm": 1.5451043844223022, "learning_rate": 2.120599438453968e-06, "loss": 0.3321, "step": 227 }, { "epoch": 0.7331189710610932, "grad_norm": 1.8228951692581177, "learning_rate": 2.074755007023461e-06, "loss": 0.3074, "step": 228 }, { "epoch": 0.7363344051446945, "grad_norm": 1.5614581108093262, "learning_rate": 2.0292814688986375e-06, "loss": 0.342, "step": 229 }, { "epoch": 0.7395498392282959, "grad_norm": 1.4238361120224, "learning_rate": 1.9841845896926022e-06, "loss": 0.3261, "step": 230 }, { "epoch": 0.7427652733118971, "grad_norm": 1.7577193975448608, "learning_rate": 1.9394700872616856e-06, "loss": 0.3312, "step": 231 }, { "epoch": 0.7459807073954984, "grad_norm": 2.6132872104644775, "learning_rate": 1.8951436309804766e-06, "loss": 0.341, "step": 232 }, { "epoch": 0.7491961414790996, "grad_norm": 1.9483258724212646, "learning_rate": 1.8512108410229878e-06, "loss": 0.3039, "step": 233 }, { "epoch": 0.752411575562701, "grad_norm": 1.289170742034912, "learning_rate": 1.8076772876500831e-06, "loss": 0.3003, "step": 234 }, { "epoch": 0.7556270096463023, "grad_norm": 2.7016942501068115, "learning_rate": 1.7645484905032129e-06, "loss": 0.3283, "step": 235 }, { "epoch": 0.7588424437299035, "grad_norm": 1.3249751329421997, "learning_rate": 1.7218299179045789e-06, "loss": 0.3128, "step": 236 }, { "epoch": 0.7620578778135049, "grad_norm": 2.7184793949127197, "learning_rate": 1.6795269861638041e-06, "loss": 0.3338, "step": 237 }, { "epoch": 0.7652733118971061, "grad_norm": 1.27732253074646, "learning_rate": 1.6376450588911985e-06, "loss": 0.2865, "step": 238 }, { "epoch": 0.7684887459807074, "grad_norm": 1.7141942977905273, "learning_rate": 1.5961894463176942e-06, "loss": 0.3154, "step": 239 }, { "epoch": 0.7717041800643086, "grad_norm": 4.537605285644531, "learning_rate": 1.555165404621567e-06, "loss": 0.3207, "step": 240 }, { "epoch": 0.77491961414791, "grad_norm": 1.5014591217041016, "learning_rate": 1.5145781352620054e-06, "loss": 0.3403, "step": 241 }, { "epoch": 0.7781350482315113, "grad_norm": 1.7332754135131836, "learning_rate": 1.4744327843196043e-06, "loss": 0.2983, "step": 242 }, { "epoch": 0.7813504823151125, "grad_norm": 1.8262887001037598, "learning_rate": 1.434734441843899e-06, "loss": 0.3052, "step": 243 }, { "epoch": 0.7845659163987139, "grad_norm": 3.564021348953247, "learning_rate": 1.3954881412079945e-06, "loss": 0.3155, "step": 244 }, { "epoch": 0.7877813504823151, "grad_norm": 1.3386446237564087, "learning_rate": 1.3566988584703817e-06, "loss": 0.29, "step": 245 }, { "epoch": 0.7909967845659164, "grad_norm": 14.863595962524414, "learning_rate": 1.3183715117440143e-06, "loss": 0.303, "step": 246 }, { "epoch": 0.7942122186495176, "grad_norm": 10.220840454101562, "learning_rate": 1.280510960572745e-06, "loss": 0.3258, "step": 247 }, { "epoch": 0.797427652733119, "grad_norm": 1.610312819480896, "learning_rate": 1.2431220053151832e-06, "loss": 0.3301, "step": 248 }, { "epoch": 0.8006430868167203, "grad_norm": 1.4349790811538696, "learning_rate": 1.2062093865360458e-06, "loss": 0.2993, "step": 249 }, { "epoch": 0.8038585209003215, "grad_norm": 3.160371780395508, "learning_rate": 1.1697777844051105e-06, "loss": 0.3119, "step": 250 }, { "epoch": 0.8070739549839229, "grad_norm": 1.5855354070663452, "learning_rate": 1.1338318181038037e-06, "loss": 0.3017, "step": 251 }, { "epoch": 0.8102893890675241, "grad_norm": 2.18475341796875, "learning_rate": 1.0983760452395415e-06, "loss": 0.3205, "step": 252 }, { "epoch": 0.8135048231511254, "grad_norm": 1.4034461975097656, "learning_rate": 1.063414961267859e-06, "loss": 0.3265, "step": 253 }, { "epoch": 0.8167202572347267, "grad_norm": 1.7966201305389404, "learning_rate": 1.02895299892244e-06, "loss": 0.3048, "step": 254 }, { "epoch": 0.819935691318328, "grad_norm": 1.4181287288665771, "learning_rate": 9.949945276530782e-07, "loss": 0.327, "step": 255 }, { "epoch": 0.8231511254019293, "grad_norm": 1.8698980808258057, "learning_rate": 9.615438530716753e-07, "loss": 0.304, "step": 256 }, { "epoch": 0.8263665594855305, "grad_norm": 1.4744179248809814, "learning_rate": 9.286052164063369e-07, "loss": 0.3335, "step": 257 }, { "epoch": 0.8295819935691319, "grad_norm": 1.7521005868911743, "learning_rate": 8.961827939636198e-07, "loss": 0.3438, "step": 258 }, { "epoch": 0.8327974276527331, "grad_norm": 2.057781934738159, "learning_rate": 8.64280696599008e-07, "loss": 0.3156, "step": 259 }, { "epoch": 0.8360128617363344, "grad_norm": 1.6581817865371704, "learning_rate": 8.329029691957124e-07, "loss": 0.3126, "step": 260 }, { "epoch": 0.8392282958199357, "grad_norm": 2.1192777156829834, "learning_rate": 8.02053590151805e-07, "loss": 0.3188, "step": 261 }, { "epoch": 0.842443729903537, "grad_norm": 1.4133118391036987, "learning_rate": 7.717364708758024e-07, "loss": 0.3211, "step": 262 }, { "epoch": 0.8456591639871383, "grad_norm": 1.5940773487091064, "learning_rate": 7.41955455290726e-07, "loss": 0.2978, "step": 263 }, { "epoch": 0.8488745980707395, "grad_norm": 2.956995725631714, "learning_rate": 7.127143193467445e-07, "loss": 0.3173, "step": 264 }, { "epoch": 0.8520900321543409, "grad_norm": 2.5843443870544434, "learning_rate": 6.840167705424106e-07, "loss": 0.3002, "step": 265 }, { "epoch": 0.8553054662379421, "grad_norm": 1.8771467208862305, "learning_rate": 6.558664474545817e-07, "loss": 0.3243, "step": 266 }, { "epoch": 0.8585209003215434, "grad_norm": 2.241569757461548, "learning_rate": 6.282669192770896e-07, "loss": 0.2968, "step": 267 }, { "epoch": 0.8617363344051447, "grad_norm": 1.668226957321167, "learning_rate": 6.012216853682001e-07, "loss": 0.32, "step": 268 }, { "epoch": 0.864951768488746, "grad_norm": 1.5583616495132446, "learning_rate": 5.747341748069229e-07, "loss": 0.309, "step": 269 }, { "epoch": 0.8681672025723473, "grad_norm": 1.433117389678955, "learning_rate": 5.488077459582425e-07, "loss": 0.3231, "step": 270 }, { "epoch": 0.8713826366559485, "grad_norm": 1.5444004535675049, "learning_rate": 5.234456860473042e-07, "loss": 0.292, "step": 271 }, { "epoch": 0.8745980707395499, "grad_norm": 1.4406189918518066, "learning_rate": 4.986512107426283e-07, "loss": 0.3043, "step": 272 }, { "epoch": 0.8778135048231511, "grad_norm": 1.3344569206237793, "learning_rate": 4.7442746374839363e-07, "loss": 0.2818, "step": 273 }, { "epoch": 0.8810289389067524, "grad_norm": 1.5688618421554565, "learning_rate": 4.50777516405847e-07, "loss": 0.295, "step": 274 }, { "epoch": 0.8842443729903537, "grad_norm": 1.4727739095687866, "learning_rate": 4.2770436730388166e-07, "loss": 0.2951, "step": 275 }, { "epoch": 0.887459807073955, "grad_norm": 2.1879146099090576, "learning_rate": 4.05210941898847e-07, "loss": 0.3191, "step": 276 }, { "epoch": 0.8906752411575563, "grad_norm": 1.7236080169677734, "learning_rate": 3.8330009214363197e-07, "loss": 0.3118, "step": 277 }, { "epoch": 0.8938906752411575, "grad_norm": 1.4538291692733765, "learning_rate": 3.619745961260623e-07, "loss": 0.3207, "step": 278 }, { "epoch": 0.8971061093247589, "grad_norm": 1.4028717279434204, "learning_rate": 3.4123715771665786e-07, "loss": 0.3278, "step": 279 }, { "epoch": 0.9003215434083601, "grad_norm": 1.755362629890442, "learning_rate": 3.2109040622582186e-07, "loss": 0.2798, "step": 280 }, { "epoch": 0.9035369774919614, "grad_norm": 2.1135966777801514, "learning_rate": 3.015368960704584e-07, "loss": 0.307, "step": 281 }, { "epoch": 0.9067524115755627, "grad_norm": 1.4916549921035767, "learning_rate": 2.8257910645009935e-07, "loss": 0.2861, "step": 282 }, { "epoch": 0.909967845659164, "grad_norm": 1.6314096450805664, "learning_rate": 2.6421944103256657e-07, "loss": 0.3065, "step": 283 }, { "epoch": 0.9131832797427653, "grad_norm": 2.6644771099090576, "learning_rate": 2.4646022764920843e-07, "loss": 0.3013, "step": 284 }, { "epoch": 0.9163987138263665, "grad_norm": 1.4383426904678345, "learning_rate": 2.2930371799975593e-07, "loss": 0.309, "step": 285 }, { "epoch": 0.9196141479099679, "grad_norm": 1.4383573532104492, "learning_rate": 2.1275208736682262e-07, "loss": 0.2973, "step": 286 }, { "epoch": 0.9228295819935691, "grad_norm": 1.249383568763733, "learning_rate": 1.9680743434010385e-07, "loss": 0.307, "step": 287 }, { "epoch": 0.9260450160771704, "grad_norm": 3.4106099605560303, "learning_rate": 1.814717805502958e-07, "loss": 0.3035, "step": 288 }, { "epoch": 0.9292604501607717, "grad_norm": 2.925081729888916, "learning_rate": 1.667470704127694e-07, "loss": 0.2966, "step": 289 }, { "epoch": 0.932475884244373, "grad_norm": 3.2131919860839844, "learning_rate": 1.5263517088103862e-07, "loss": 0.3021, "step": 290 }, { "epoch": 0.9356913183279743, "grad_norm": 1.7186495065689087, "learning_rate": 1.3913787121004717e-07, "loss": 0.3164, "step": 291 }, { "epoch": 0.9389067524115756, "grad_norm": 1.486461877822876, "learning_rate": 1.2625688272930925e-07, "loss": 0.3191, "step": 292 }, { "epoch": 0.9421221864951769, "grad_norm": 1.584938406944275, "learning_rate": 1.1399383862592928e-07, "loss": 0.3005, "step": 293 }, { "epoch": 0.9453376205787781, "grad_norm": 3.820082426071167, "learning_rate": 1.0235029373752758e-07, "loss": 0.3019, "step": 294 }, { "epoch": 0.9485530546623794, "grad_norm": 1.3474429845809937, "learning_rate": 9.132772435510362e-08, "loss": 0.2809, "step": 295 }, { "epoch": 0.9517684887459807, "grad_norm": 1.2956693172454834, "learning_rate": 8.092752803585513e-08, "loss": 0.2959, "step": 296 }, { "epoch": 0.954983922829582, "grad_norm": 7.804889678955078, "learning_rate": 7.115102342598101e-08, "loss": 0.2872, "step": 297 }, { "epoch": 0.9581993569131833, "grad_norm": 3.1022164821624756, "learning_rate": 6.199945009349173e-08, "loss": 0.3173, "step": 298 }, { "epoch": 0.9614147909967846, "grad_norm": 4.706262111663818, "learning_rate": 5.3473968371040575e-08, "loss": 0.2896, "step": 299 }, { "epoch": 0.9646302250803859, "grad_norm": 1.4467219114303589, "learning_rate": 4.55756592088058e-08, "loss": 0.2965, "step": 300 }, { "epoch": 0.9678456591639871, "grad_norm": 1.4120930433273315, "learning_rate": 3.8305524037438035e-08, "loss": 0.3084, "step": 301 }, { "epoch": 0.9710610932475884, "grad_norm": 1.5261682271957397, "learning_rate": 3.166448464108629e-08, "loss": 0.328, "step": 302 }, { "epoch": 0.9742765273311897, "grad_norm": 1.4508821964263916, "learning_rate": 2.5653383040524228e-08, "loss": 0.2849, "step": 303 }, { "epoch": 0.977491961414791, "grad_norm": 6.040746212005615, "learning_rate": 2.0272981386393332e-08, "loss": 0.3468, "step": 304 }, { "epoch": 0.9807073954983923, "grad_norm": 2.0974323749542236, "learning_rate": 1.552396186256411e-08, "loss": 0.2976, "step": 305 }, { "epoch": 0.9839228295819936, "grad_norm": 1.473928451538086, "learning_rate": 1.1406926599646373e-08, "loss": 0.3228, "step": 306 }, { "epoch": 0.9871382636655949, "grad_norm": 2.54904842376709, "learning_rate": 7.922397598642551e-09, "loss": 0.2999, "step": 307 }, { "epoch": 0.9903536977491961, "grad_norm": 1.7829190492630005, "learning_rate": 5.0708166647628345e-09, "loss": 0.3042, "step": 308 }, { "epoch": 0.9935691318327974, "grad_norm": 1.8181827068328857, "learning_rate": 2.8525453514099966e-09, "loss": 0.3057, "step": 309 }, { "epoch": 0.9967845659163987, "grad_norm": 3.868682384490967, "learning_rate": 1.2678649143349485e-09, "loss": 0.3086, "step": 310 }, { "epoch": 1.0, "grad_norm": 1.5854872465133667, "learning_rate": 3.1697627597970794e-10, "loss": 0.3017, "step": 311 }, { "epoch": 1.0, "step": 311, "total_flos": 3.149345964351816e+17, "train_loss": 0.4079481167808606, "train_runtime": 5827.269, "train_samples_per_second": 3.415, "train_steps_per_second": 0.053 } ], "logging_steps": 1, "max_steps": 311, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.149345964351816e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }