{ "best_global_step": 264, "best_metric": 1.308773159980774, "best_model_checkpoint": "saves/qwen3-4B/medical-o1-sft-full-1e-5/checkpoint-264", "epoch": 3.0, "eval_steps": 44, "global_step": 441, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006837606837606838, "grad_norm": 24.729957580566406, "learning_rate": 0.0, "loss": 2.180166482925415, "step": 1 }, { "epoch": 0.013675213675213675, "grad_norm": 25.152711868286133, "learning_rate": 4.347826086956522e-07, "loss": 2.1789543628692627, "step": 2 }, { "epoch": 0.020512820512820513, "grad_norm": 24.6761417388916, "learning_rate": 8.695652173913044e-07, "loss": 2.204561233520508, "step": 3 }, { "epoch": 0.02735042735042735, "grad_norm": 24.276906967163086, "learning_rate": 1.3043478260869566e-06, "loss": 2.1825883388519287, "step": 4 }, { "epoch": 0.03418803418803419, "grad_norm": 23.327831268310547, "learning_rate": 1.7391304347826088e-06, "loss": 2.2022361755371094, "step": 5 }, { "epoch": 0.041025641025641026, "grad_norm": 20.180011749267578, "learning_rate": 2.173913043478261e-06, "loss": 2.0757670402526855, "step": 6 }, { "epoch": 0.04786324786324787, "grad_norm": 18.820642471313477, "learning_rate": 2.6086956521739132e-06, "loss": 2.024721145629883, "step": 7 }, { "epoch": 0.0547008547008547, "grad_norm": 13.223835945129395, "learning_rate": 3.043478260869566e-06, "loss": 1.9034565687179565, "step": 8 }, { "epoch": 0.06153846153846154, "grad_norm": 11.584263801574707, "learning_rate": 3.4782608695652175e-06, "loss": 1.8130236864089966, "step": 9 }, { "epoch": 0.06837606837606838, "grad_norm": 5.6841607093811035, "learning_rate": 3.91304347826087e-06, "loss": 1.6309248208999634, "step": 10 }, { "epoch": 0.07521367521367521, "grad_norm": 4.208008766174316, "learning_rate": 4.347826086956522e-06, "loss": 1.5361576080322266, "step": 11 }, { "epoch": 0.08205128205128205, "grad_norm": 3.528555154800415, "learning_rate": 4.782608695652174e-06, "loss": 1.6088225841522217, "step": 12 }, { "epoch": 0.08888888888888889, "grad_norm": 3.099165916442871, "learning_rate": 5.2173913043478265e-06, "loss": 1.5432047843933105, "step": 13 }, { "epoch": 0.09572649572649573, "grad_norm": 6.412608623504639, "learning_rate": 5.652173913043479e-06, "loss": 1.5963867902755737, "step": 14 }, { "epoch": 0.10256410256410256, "grad_norm": 5.609615802764893, "learning_rate": 6.086956521739132e-06, "loss": 1.5698325634002686, "step": 15 }, { "epoch": 0.1094017094017094, "grad_norm": 4.161319255828857, "learning_rate": 6.521739130434783e-06, "loss": 1.555444598197937, "step": 16 }, { "epoch": 0.11623931623931624, "grad_norm": 3.2057743072509766, "learning_rate": 6.956521739130435e-06, "loss": 1.475843906402588, "step": 17 }, { "epoch": 0.12307692307692308, "grad_norm": 2.5646772384643555, "learning_rate": 7.391304347826087e-06, "loss": 1.509574294090271, "step": 18 }, { "epoch": 0.12991452991452992, "grad_norm": 1.9250593185424805, "learning_rate": 7.82608695652174e-06, "loss": 1.4932482242584229, "step": 19 }, { "epoch": 0.13675213675213677, "grad_norm": 1.6663166284561157, "learning_rate": 8.260869565217392e-06, "loss": 1.4706228971481323, "step": 20 }, { "epoch": 0.14358974358974358, "grad_norm": 1.488690733909607, "learning_rate": 8.695652173913044e-06, "loss": 1.4192920923233032, "step": 21 }, { "epoch": 0.15042735042735042, "grad_norm": 1.3503153324127197, "learning_rate": 9.130434782608697e-06, "loss": 1.427452802658081, "step": 22 }, { "epoch": 0.15726495726495726, "grad_norm": 1.2214534282684326, "learning_rate": 9.565217391304349e-06, "loss": 1.4610393047332764, "step": 23 }, { "epoch": 0.1641025641025641, "grad_norm": 1.1983873844146729, "learning_rate": 1e-05, "loss": 1.4273948669433594, "step": 24 }, { "epoch": 0.17094017094017094, "grad_norm": 1.1930960416793823, "learning_rate": 9.999858783596665e-06, "loss": 1.4003199338912964, "step": 25 }, { "epoch": 0.17777777777777778, "grad_norm": 1.0275226831436157, "learning_rate": 9.999435142363484e-06, "loss": 1.4090672731399536, "step": 26 }, { "epoch": 0.18461538461538463, "grad_norm": 1.001726508140564, "learning_rate": 9.998729100230497e-06, "loss": 1.3982799053192139, "step": 27 }, { "epoch": 0.19145299145299147, "grad_norm": 0.9476358890533447, "learning_rate": 9.997740697079595e-06, "loss": 1.4250205755233765, "step": 28 }, { "epoch": 0.19829059829059828, "grad_norm": 0.9169353246688843, "learning_rate": 9.99646998874227e-06, "loss": 1.407841682434082, "step": 29 }, { "epoch": 0.20512820512820512, "grad_norm": 0.9049670696258545, "learning_rate": 9.994917046996472e-06, "loss": 1.4163107872009277, "step": 30 }, { "epoch": 0.21196581196581196, "grad_norm": 0.902590811252594, "learning_rate": 9.993081959562539e-06, "loss": 1.4395619630813599, "step": 31 }, { "epoch": 0.2188034188034188, "grad_norm": 0.9725260138511658, "learning_rate": 9.990964830098246e-06, "loss": 1.4067661762237549, "step": 32 }, { "epoch": 0.22564102564102564, "grad_norm": 0.8750798106193542, "learning_rate": 9.98856577819296e-06, "loss": 1.4079771041870117, "step": 33 }, { "epoch": 0.23247863247863249, "grad_norm": 0.8549812436103821, "learning_rate": 9.985884939360873e-06, "loss": 1.398482322692871, "step": 34 }, { "epoch": 0.23931623931623933, "grad_norm": 0.869503378868103, "learning_rate": 9.98292246503335e-06, "loss": 1.344150424003601, "step": 35 }, { "epoch": 0.24615384615384617, "grad_norm": 0.9242067337036133, "learning_rate": 9.979678522550382e-06, "loss": 1.37479567527771, "step": 36 }, { "epoch": 0.252991452991453, "grad_norm": 0.8416987657546997, "learning_rate": 9.976153295151123e-06, "loss": 1.3731480836868286, "step": 37 }, { "epoch": 0.25982905982905985, "grad_norm": 0.9907390475273132, "learning_rate": 9.972346981963546e-06, "loss": 1.3624351024627686, "step": 38 }, { "epoch": 0.26666666666666666, "grad_norm": 0.8205696940422058, "learning_rate": 9.968259797993197e-06, "loss": 1.3645293712615967, "step": 39 }, { "epoch": 0.27350427350427353, "grad_norm": 0.8257843852043152, "learning_rate": 9.963891974111042e-06, "loss": 1.3727067708969116, "step": 40 }, { "epoch": 0.28034188034188035, "grad_norm": 0.7986466288566589, "learning_rate": 9.959243757040434e-06, "loss": 1.3945657014846802, "step": 41 }, { "epoch": 0.28717948717948716, "grad_norm": 0.9684669971466064, "learning_rate": 9.95431540934317e-06, "loss": 1.3376381397247314, "step": 42 }, { "epoch": 0.294017094017094, "grad_norm": 0.7717859148979187, "learning_rate": 9.949107209404664e-06, "loss": 1.354946494102478, "step": 43 }, { "epoch": 0.30085470085470084, "grad_norm": 0.8021324276924133, "learning_rate": 9.943619451418225e-06, "loss": 1.3951725959777832, "step": 44 }, { "epoch": 0.30085470085470084, "eval_loss": 1.362805724143982, "eval_runtime": 24.9887, "eval_samples_per_second": 39.458, "eval_steps_per_second": 4.962, "step": 44 }, { "epoch": 0.3076923076923077, "grad_norm": 0.829911470413208, "learning_rate": 9.937852445368427e-06, "loss": 1.3832783699035645, "step": 45 }, { "epoch": 0.3145299145299145, "grad_norm": 0.8109715580940247, "learning_rate": 9.931806517013612e-06, "loss": 1.3637301921844482, "step": 46 }, { "epoch": 0.3213675213675214, "grad_norm": 0.7627991437911987, "learning_rate": 9.925482007867485e-06, "loss": 1.3353031873703003, "step": 47 }, { "epoch": 0.3282051282051282, "grad_norm": 0.7720788717269897, "learning_rate": 9.918879275179819e-06, "loss": 1.367252230644226, "step": 48 }, { "epoch": 0.335042735042735, "grad_norm": 0.7520493865013123, "learning_rate": 9.911998691916275e-06, "loss": 1.386542797088623, "step": 49 }, { "epoch": 0.3418803418803419, "grad_norm": 0.7559177875518799, "learning_rate": 9.904840646737346e-06, "loss": 1.3789976835250854, "step": 50 }, { "epoch": 0.3487179487179487, "grad_norm": 0.770207405090332, "learning_rate": 9.89740554397639e-06, "loss": 1.356705904006958, "step": 51 }, { "epoch": 0.35555555555555557, "grad_norm": 0.7609772086143494, "learning_rate": 9.889693803616793e-06, "loss": 1.3461980819702148, "step": 52 }, { "epoch": 0.3623931623931624, "grad_norm": 0.7604424953460693, "learning_rate": 9.881705861268252e-06, "loss": 1.344923496246338, "step": 53 }, { "epoch": 0.36923076923076925, "grad_norm": 0.7701961398124695, "learning_rate": 9.873442168142158e-06, "loss": 1.364449143409729, "step": 54 }, { "epoch": 0.37606837606837606, "grad_norm": 0.7939377427101135, "learning_rate": 9.864903191026125e-06, "loss": 1.4013525247573853, "step": 55 }, { "epoch": 0.38290598290598293, "grad_norm": 0.7690542340278625, "learning_rate": 9.856089412257605e-06, "loss": 1.3586581945419312, "step": 56 }, { "epoch": 0.38974358974358975, "grad_norm": 0.798068106174469, "learning_rate": 9.847001329696653e-06, "loss": 1.3378022909164429, "step": 57 }, { "epoch": 0.39658119658119656, "grad_norm": 0.7824757695198059, "learning_rate": 9.837639456697802e-06, "loss": 1.3118129968643188, "step": 58 }, { "epoch": 0.40341880341880343, "grad_norm": 0.7629351019859314, "learning_rate": 9.828004322081067e-06, "loss": 1.3393217325210571, "step": 59 }, { "epoch": 0.41025641025641024, "grad_norm": 0.7708514332771301, "learning_rate": 9.818096470102067e-06, "loss": 1.3732938766479492, "step": 60 }, { "epoch": 0.4170940170940171, "grad_norm": 0.8133201003074646, "learning_rate": 9.807916460421294e-06, "loss": 1.3423891067504883, "step": 61 }, { "epoch": 0.4239316239316239, "grad_norm": 0.7727287411689758, "learning_rate": 9.797464868072489e-06, "loss": 1.3378151655197144, "step": 62 }, { "epoch": 0.4307692307692308, "grad_norm": 0.7684638500213623, "learning_rate": 9.78674228343016e-06, "loss": 1.3335256576538086, "step": 63 }, { "epoch": 0.4376068376068376, "grad_norm": 0.7602411508560181, "learning_rate": 9.775749312176249e-06, "loss": 1.3320605754852295, "step": 64 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8044481873512268, "learning_rate": 9.764486575265893e-06, "loss": 1.3325685262680054, "step": 65 }, { "epoch": 0.4512820512820513, "grad_norm": 0.7876479029655457, "learning_rate": 9.752954708892379e-06, "loss": 1.3242830038070679, "step": 66 }, { "epoch": 0.4581196581196581, "grad_norm": 0.7659040689468384, "learning_rate": 9.741154364451179e-06, "loss": 1.3692903518676758, "step": 67 }, { "epoch": 0.46495726495726497, "grad_norm": 0.8316842317581177, "learning_rate": 9.729086208503174e-06, "loss": 1.344923734664917, "step": 68 }, { "epoch": 0.4717948717948718, "grad_norm": 0.8216245174407959, "learning_rate": 9.716750922736998e-06, "loss": 1.3780957460403442, "step": 69 }, { "epoch": 0.47863247863247865, "grad_norm": 0.7839699387550354, "learning_rate": 9.704149203930522e-06, "loss": 1.3786989450454712, "step": 70 }, { "epoch": 0.48547008547008547, "grad_norm": 0.7707169055938721, "learning_rate": 9.691281763911513e-06, "loss": 1.3283625841140747, "step": 71 }, { "epoch": 0.49230769230769234, "grad_norm": 0.7598075270652771, "learning_rate": 9.67814932951741e-06, "loss": 1.3375245332717896, "step": 72 }, { "epoch": 0.49914529914529915, "grad_norm": 0.8022596836090088, "learning_rate": 9.664752642554272e-06, "loss": 1.3409022092819214, "step": 73 }, { "epoch": 0.505982905982906, "grad_norm": 0.7512302398681641, "learning_rate": 9.651092459754879e-06, "loss": 1.2996271848678589, "step": 74 }, { "epoch": 0.5128205128205128, "grad_norm": 0.7390022277832031, "learning_rate": 9.637169552735985e-06, "loss": 1.3141694068908691, "step": 75 }, { "epoch": 0.5196581196581197, "grad_norm": 0.7599424123764038, "learning_rate": 9.622984707954732e-06, "loss": 1.3220386505126953, "step": 76 }, { "epoch": 0.5264957264957265, "grad_norm": 0.7562436461448669, "learning_rate": 9.608538726664224e-06, "loss": 1.3605300188064575, "step": 77 }, { "epoch": 0.5333333333333333, "grad_norm": 0.7731190919876099, "learning_rate": 9.593832424868271e-06, "loss": 1.3461638689041138, "step": 78 }, { "epoch": 0.5401709401709401, "grad_norm": 0.7543560266494751, "learning_rate": 9.578866633275289e-06, "loss": 1.340885877609253, "step": 79 }, { "epoch": 0.5470085470085471, "grad_norm": 0.772647500038147, "learning_rate": 9.563642197251382e-06, "loss": 1.3663382530212402, "step": 80 }, { "epoch": 0.5538461538461539, "grad_norm": 0.7314751148223877, "learning_rate": 9.548159976772593e-06, "loss": 1.3287297487258911, "step": 81 }, { "epoch": 0.5606837606837607, "grad_norm": 0.7391103506088257, "learning_rate": 9.532420846376316e-06, "loss": 1.3285285234451294, "step": 82 }, { "epoch": 0.5675213675213675, "grad_norm": 0.7641813158988953, "learning_rate": 9.516425695111906e-06, "loss": 1.3269128799438477, "step": 83 }, { "epoch": 0.5743589743589743, "grad_norm": 0.7769819498062134, "learning_rate": 9.500175426490455e-06, "loss": 1.3374706506729126, "step": 84 }, { "epoch": 0.5811965811965812, "grad_norm": 0.7199158668518066, "learning_rate": 9.48367095843376e-06, "loss": 1.3117002248764038, "step": 85 }, { "epoch": 0.588034188034188, "grad_norm": 0.7510148882865906, "learning_rate": 9.466913223222467e-06, "loss": 1.3387565612792969, "step": 86 }, { "epoch": 0.5948717948717949, "grad_norm": 0.7325724363327026, "learning_rate": 9.449903167443415e-06, "loss": 1.269672155380249, "step": 87 }, { "epoch": 0.6017094017094017, "grad_norm": 0.7675944566726685, "learning_rate": 9.432641751936162e-06, "loss": 1.3153454065322876, "step": 88 }, { "epoch": 0.6017094017094017, "eval_loss": 1.3318638801574707, "eval_runtime": 24.6717, "eval_samples_per_second": 39.965, "eval_steps_per_second": 5.026, "step": 88 }, { "epoch": 0.6085470085470085, "grad_norm": 0.7539426684379578, "learning_rate": 9.415129951738713e-06, "loss": 1.378519058227539, "step": 89 }, { "epoch": 0.6153846153846154, "grad_norm": 0.7739952802658081, "learning_rate": 9.397368756032445e-06, "loss": 1.3163981437683105, "step": 90 }, { "epoch": 0.6222222222222222, "grad_norm": 0.7639786005020142, "learning_rate": 9.379359168086231e-06, "loss": 1.3244612216949463, "step": 91 }, { "epoch": 0.629059829059829, "grad_norm": 0.7307687997817993, "learning_rate": 9.361102205199762e-06, "loss": 1.3425580263137817, "step": 92 }, { "epoch": 0.6358974358974359, "grad_norm": 0.7326052188873291, "learning_rate": 9.34259889864609e-06, "loss": 1.349947452545166, "step": 93 }, { "epoch": 0.6427350427350428, "grad_norm": 0.7336087822914124, "learning_rate": 9.32385029361338e-06, "loss": 1.3235843181610107, "step": 94 }, { "epoch": 0.6495726495726496, "grad_norm": 0.7857178449630737, "learning_rate": 9.304857449145858e-06, "loss": 1.29775071144104, "step": 95 }, { "epoch": 0.6564102564102564, "grad_norm": 0.7694044709205627, "learning_rate": 9.285621438083997e-06, "loss": 1.3575528860092163, "step": 96 }, { "epoch": 0.6632478632478632, "grad_norm": 0.7426573634147644, "learning_rate": 9.26614334700392e-06, "loss": 1.334963083267212, "step": 97 }, { "epoch": 0.67008547008547, "grad_norm": 0.7567334175109863, "learning_rate": 9.246424276156008e-06, "loss": 1.335172176361084, "step": 98 }, { "epoch": 0.676923076923077, "grad_norm": 0.733529269695282, "learning_rate": 9.226465339402768e-06, "loss": 1.3033547401428223, "step": 99 }, { "epoch": 0.6837606837606838, "grad_norm": 0.7475197315216064, "learning_rate": 9.206267664155906e-06, "loss": 1.316215991973877, "step": 100 }, { "epoch": 0.6905982905982906, "grad_norm": 0.7870779633522034, "learning_rate": 9.185832391312644e-06, "loss": 1.347679853439331, "step": 101 }, { "epoch": 0.6974358974358974, "grad_norm": 0.764722466468811, "learning_rate": 9.165160675191272e-06, "loss": 1.305860996246338, "step": 102 }, { "epoch": 0.7042735042735043, "grad_norm": 0.7680871486663818, "learning_rate": 9.144253683465953e-06, "loss": 1.3211126327514648, "step": 103 }, { "epoch": 0.7111111111111111, "grad_norm": 0.734742283821106, "learning_rate": 9.123112597100759e-06, "loss": 1.2861220836639404, "step": 104 }, { "epoch": 0.717948717948718, "grad_norm": 0.7347426414489746, "learning_rate": 9.101738610282956e-06, "loss": 1.315138578414917, "step": 105 }, { "epoch": 0.7247863247863248, "grad_norm": 0.7639749646186829, "learning_rate": 9.080132930355567e-06, "loss": 1.3426464796066284, "step": 106 }, { "epoch": 0.7316239316239316, "grad_norm": 0.7904943227767944, "learning_rate": 9.058296777749154e-06, "loss": 1.334005355834961, "step": 107 }, { "epoch": 0.7384615384615385, "grad_norm": 0.780296266078949, "learning_rate": 9.03623138591289e-06, "loss": 1.3893626928329468, "step": 108 }, { "epoch": 0.7452991452991453, "grad_norm": 0.7619044184684753, "learning_rate": 9.013938001244885e-06, "loss": 1.3112680912017822, "step": 109 }, { "epoch": 0.7521367521367521, "grad_norm": 0.7852951884269714, "learning_rate": 8.99141788302178e-06, "loss": 1.3263344764709473, "step": 110 }, { "epoch": 0.7589743589743589, "grad_norm": 0.746293306350708, "learning_rate": 8.968672303327614e-06, "loss": 1.3137162923812866, "step": 111 }, { "epoch": 0.7658119658119659, "grad_norm": 0.7697060704231262, "learning_rate": 8.94570254698197e-06, "loss": 1.305846095085144, "step": 112 }, { "epoch": 0.7726495726495727, "grad_norm": 0.7505799531936646, "learning_rate": 8.922509911467395e-06, "loss": 1.3263046741485596, "step": 113 }, { "epoch": 0.7794871794871795, "grad_norm": 0.7378644347190857, "learning_rate": 8.899095706856122e-06, "loss": 1.2952595949172974, "step": 114 }, { "epoch": 0.7863247863247863, "grad_norm": 0.7393775582313538, "learning_rate": 8.875461255736055e-06, "loss": 1.314041018486023, "step": 115 }, { "epoch": 0.7931623931623931, "grad_norm": 0.7198286056518555, "learning_rate": 8.851607893136065e-06, "loss": 1.301222801208496, "step": 116 }, { "epoch": 0.8, "grad_norm": 0.7539902925491333, "learning_rate": 8.827536966450584e-06, "loss": 1.3459645509719849, "step": 117 }, { "epoch": 0.8068376068376069, "grad_norm": 0.728272020816803, "learning_rate": 8.803249835363486e-06, "loss": 1.3075345754623413, "step": 118 }, { "epoch": 0.8136752136752137, "grad_norm": 0.7353615164756775, "learning_rate": 8.778747871771293e-06, "loss": 1.2967561483383179, "step": 119 }, { "epoch": 0.8205128205128205, "grad_norm": 0.7358576655387878, "learning_rate": 8.754032459705672e-06, "loss": 1.3145124912261963, "step": 120 }, { "epoch": 0.8273504273504273, "grad_norm": 0.7736720442771912, "learning_rate": 8.729104995255265e-06, "loss": 1.3146538734436035, "step": 121 }, { "epoch": 0.8341880341880342, "grad_norm": 0.7337418794631958, "learning_rate": 8.703966886486819e-06, "loss": 1.2823609113693237, "step": 122 }, { "epoch": 0.841025641025641, "grad_norm": 0.7514926195144653, "learning_rate": 8.67861955336566e-06, "loss": 1.3389618396759033, "step": 123 }, { "epoch": 0.8478632478632478, "grad_norm": 0.7190932035446167, "learning_rate": 8.65306442767547e-06, "loss": 1.3115108013153076, "step": 124 }, { "epoch": 0.8547008547008547, "grad_norm": 0.7332461476325989, "learning_rate": 8.627302952937431e-06, "loss": 1.333253264427185, "step": 125 }, { "epoch": 0.8615384615384616, "grad_norm": 0.7428878545761108, "learning_rate": 8.601336584328659e-06, "loss": 1.3187751770019531, "step": 126 }, { "epoch": 0.8683760683760684, "grad_norm": 0.7715012431144714, "learning_rate": 8.575166788600031e-06, "loss": 1.3300316333770752, "step": 127 }, { "epoch": 0.8752136752136752, "grad_norm": 0.7566640973091125, "learning_rate": 8.548795043993316e-06, "loss": 1.307992696762085, "step": 128 }, { "epoch": 0.882051282051282, "grad_norm": 0.7760566473007202, "learning_rate": 8.522222840157687e-06, "loss": 1.32774817943573, "step": 129 }, { "epoch": 0.8888888888888888, "grad_norm": 0.7682384848594666, "learning_rate": 8.495451678065563e-06, "loss": 1.3295447826385498, "step": 130 }, { "epoch": 0.8957264957264958, "grad_norm": 0.7397897839546204, "learning_rate": 8.468483069927832e-06, "loss": 1.3145328760147095, "step": 131 }, { "epoch": 0.9025641025641026, "grad_norm": 0.7603890299797058, "learning_rate": 8.441318539108433e-06, "loss": 1.3174394369125366, "step": 132 }, { "epoch": 0.9025641025641026, "eval_loss": 1.317511796951294, "eval_runtime": 24.6804, "eval_samples_per_second": 39.951, "eval_steps_per_second": 5.024, "step": 132 }, { "epoch": 0.9094017094017094, "grad_norm": 0.7623502612113953, "learning_rate": 8.413959620038306e-06, "loss": 1.3393348455429077, "step": 133 }, { "epoch": 0.9162393162393162, "grad_norm": 0.7669332027435303, "learning_rate": 8.386407858128707e-06, "loss": 1.302769660949707, "step": 134 }, { "epoch": 0.9230769230769231, "grad_norm": 0.7234067320823669, "learning_rate": 8.358664809683926e-06, "loss": 1.3381096124649048, "step": 135 }, { "epoch": 0.9299145299145299, "grad_norm": 0.7574735283851624, "learning_rate": 8.330732041813367e-06, "loss": 1.335377812385559, "step": 136 }, { "epoch": 0.9367521367521368, "grad_norm": 0.7575842142105103, "learning_rate": 8.302611132343042e-06, "loss": 1.3330005407333374, "step": 137 }, { "epoch": 0.9435897435897436, "grad_norm": 0.7127556800842285, "learning_rate": 8.274303669726427e-06, "loss": 1.2971893548965454, "step": 138 }, { "epoch": 0.9504273504273504, "grad_norm": 0.8172794580459595, "learning_rate": 8.245811252954741e-06, "loss": 1.3225749731063843, "step": 139 }, { "epoch": 0.9572649572649573, "grad_norm": 0.7154548764228821, "learning_rate": 8.217135491466636e-06, "loss": 1.2955387830734253, "step": 140 }, { "epoch": 0.9641025641025641, "grad_norm": 0.7610012888908386, "learning_rate": 8.18827800505727e-06, "loss": 1.3369195461273193, "step": 141 }, { "epoch": 0.9709401709401709, "grad_norm": 0.7487711906433105, "learning_rate": 8.15924042378682e-06, "loss": 1.2916451692581177, "step": 142 }, { "epoch": 0.9777777777777777, "grad_norm": 0.7546627521514893, "learning_rate": 8.130024387888402e-06, "loss": 1.310347318649292, "step": 143 }, { "epoch": 0.9846153846153847, "grad_norm": 0.7537707090377808, "learning_rate": 8.100631547675417e-06, "loss": 1.3267855644226074, "step": 144 }, { "epoch": 0.9914529914529915, "grad_norm": 0.7335416078567505, "learning_rate": 8.071063563448341e-06, "loss": 1.2958036661148071, "step": 145 }, { "epoch": 0.9982905982905983, "grad_norm": 0.773562490940094, "learning_rate": 8.041322105400923e-06, "loss": 1.2804107666015625, "step": 146 }, { "epoch": 1.0, "grad_norm": 1.4411433935165405, "learning_rate": 8.01140885352586e-06, "loss": 1.3802165985107422, "step": 147 }, { "epoch": 1.0068376068376068, "grad_norm": 0.9124190211296082, "learning_rate": 7.981325497519892e-06, "loss": 1.2135487794876099, "step": 148 }, { "epoch": 1.0136752136752136, "grad_norm": 0.8284032344818115, "learning_rate": 7.951073736688348e-06, "loss": 1.1935949325561523, "step": 149 }, { "epoch": 1.0205128205128204, "grad_norm": 0.8174305558204651, "learning_rate": 7.920655279849173e-06, "loss": 1.2410966157913208, "step": 150 }, { "epoch": 1.0273504273504273, "grad_norm": 0.7865321040153503, "learning_rate": 7.890071845236395e-06, "loss": 1.2489113807678223, "step": 151 }, { "epoch": 1.0341880341880343, "grad_norm": 0.812463104724884, "learning_rate": 7.859325160403073e-06, "loss": 1.1999475955963135, "step": 152 }, { "epoch": 1.041025641025641, "grad_norm": 0.8780131936073303, "learning_rate": 7.8284169621237e-06, "loss": 1.2193069458007812, "step": 153 }, { "epoch": 1.047863247863248, "grad_norm": 0.8348581790924072, "learning_rate": 7.797348996296116e-06, "loss": 1.1925896406173706, "step": 154 }, { "epoch": 1.0547008547008547, "grad_norm": 0.8675538897514343, "learning_rate": 7.766123017842877e-06, "loss": 1.2143549919128418, "step": 155 }, { "epoch": 1.0615384615384615, "grad_norm": 0.8252431750297546, "learning_rate": 7.734740790612137e-06, "loss": 1.2455641031265259, "step": 156 }, { "epoch": 1.0683760683760684, "grad_norm": 0.8385781049728394, "learning_rate": 7.703204087277989e-06, "loss": 1.2102444171905518, "step": 157 }, { "epoch": 1.0752136752136752, "grad_norm": 0.827889084815979, "learning_rate": 7.671514689240366e-06, "loss": 1.2144052982330322, "step": 158 }, { "epoch": 1.082051282051282, "grad_norm": 0.7633846998214722, "learning_rate": 7.639674386524395e-06, "loss": 1.2118767499923706, "step": 159 }, { "epoch": 1.0888888888888888, "grad_norm": 0.8267090320587158, "learning_rate": 7.607684977679284e-06, "loss": 1.188737392425537, "step": 160 }, { "epoch": 1.0957264957264958, "grad_norm": 0.8270633816719055, "learning_rate": 7.575548269676741e-06, "loss": 1.214994192123413, "step": 161 }, { "epoch": 1.1025641025641026, "grad_norm": 0.8160786628723145, "learning_rate": 7.543266077808893e-06, "loss": 1.221800446510315, "step": 162 }, { "epoch": 1.1094017094017095, "grad_norm": 0.829490065574646, "learning_rate": 7.510840225585749e-06, "loss": 1.1974472999572754, "step": 163 }, { "epoch": 1.1162393162393163, "grad_norm": 0.8170298933982849, "learning_rate": 7.478272544632204e-06, "loss": 1.2150561809539795, "step": 164 }, { "epoch": 1.123076923076923, "grad_norm": 0.7731851935386658, "learning_rate": 7.44556487458456e-06, "loss": 1.1988686323165894, "step": 165 }, { "epoch": 1.12991452991453, "grad_norm": 0.7923320531845093, "learning_rate": 7.412719062986632e-06, "loss": 1.2086683511734009, "step": 166 }, { "epoch": 1.1367521367521367, "grad_norm": 0.7592716217041016, "learning_rate": 7.379736965185369e-06, "loss": 1.215879201889038, "step": 167 }, { "epoch": 1.1435897435897435, "grad_norm": 0.7586809396743774, "learning_rate": 7.3466204442260605e-06, "loss": 1.2311599254608154, "step": 168 }, { "epoch": 1.1504273504273503, "grad_norm": 0.7838971614837646, "learning_rate": 7.313371370747104e-06, "loss": 1.2183728218078613, "step": 169 }, { "epoch": 1.1572649572649572, "grad_norm": 0.7780983448028564, "learning_rate": 7.279991622874319e-06, "loss": 1.1952356100082397, "step": 170 }, { "epoch": 1.1641025641025642, "grad_norm": 0.7715050578117371, "learning_rate": 7.24648308611489e-06, "loss": 1.2417360544204712, "step": 171 }, { "epoch": 1.170940170940171, "grad_norm": 0.7692239880561829, "learning_rate": 7.212847653250828e-06, "loss": 1.2170333862304688, "step": 172 }, { "epoch": 1.1777777777777778, "grad_norm": 0.7896147966384888, "learning_rate": 7.1790872242320775e-06, "loss": 1.2121965885162354, "step": 173 }, { "epoch": 1.1846153846153846, "grad_norm": 0.8173856139183044, "learning_rate": 7.145203706069183e-06, "loss": 1.1911547183990479, "step": 174 }, { "epoch": 1.1914529914529914, "grad_norm": 0.7522553205490112, "learning_rate": 7.1111990127255684e-06, "loss": 1.210161566734314, "step": 175 }, { "epoch": 1.1982905982905983, "grad_norm": 0.7353285551071167, "learning_rate": 7.0770750650094335e-06, "loss": 1.1757725477218628, "step": 176 }, { "epoch": 1.1982905982905983, "eval_loss": 1.3184372186660767, "eval_runtime": 24.8388, "eval_samples_per_second": 39.696, "eval_steps_per_second": 4.992, "step": 176 }, { "epoch": 1.205128205128205, "grad_norm": 0.7701054811477661, "learning_rate": 7.042833790465241e-06, "loss": 1.2243812084197998, "step": 177 }, { "epoch": 1.2119658119658119, "grad_norm": 0.7278676629066467, "learning_rate": 7.008477123264849e-06, "loss": 1.198972463607788, "step": 178 }, { "epoch": 1.218803418803419, "grad_norm": 0.7595424056053162, "learning_rate": 6.974007004098243e-06, "loss": 1.2435779571533203, "step": 179 }, { "epoch": 1.2256410256410257, "grad_norm": 0.7661744952201843, "learning_rate": 6.939425380063924e-06, "loss": 1.2413814067840576, "step": 180 }, { "epoch": 1.2324786324786325, "grad_norm": 0.7790281176567078, "learning_rate": 6.9047342045589224e-06, "loss": 1.1771953105926514, "step": 181 }, { "epoch": 1.2393162393162394, "grad_norm": 0.7655471563339233, "learning_rate": 6.869935437168449e-06, "loss": 1.203190565109253, "step": 182 }, { "epoch": 1.2461538461538462, "grad_norm": 0.784903347492218, "learning_rate": 6.835031043555211e-06, "loss": 1.2171598672866821, "step": 183 }, { "epoch": 1.252991452991453, "grad_norm": 0.7539082765579224, "learning_rate": 6.800022995348381e-06, "loss": 1.2139626741409302, "step": 184 }, { "epoch": 1.2598290598290598, "grad_norm": 0.7623985409736633, "learning_rate": 6.76491327003222e-06, "loss": 1.2187587022781372, "step": 185 }, { "epoch": 1.2666666666666666, "grad_norm": 0.7418251037597656, "learning_rate": 6.729703850834381e-06, "loss": 1.2088682651519775, "step": 186 }, { "epoch": 1.2735042735042734, "grad_norm": 0.7652315497398376, "learning_rate": 6.694396726613883e-06, "loss": 1.2204537391662598, "step": 187 }, { "epoch": 1.2803418803418802, "grad_norm": 0.7618216872215271, "learning_rate": 6.65899389174876e-06, "loss": 1.220557451248169, "step": 188 }, { "epoch": 1.287179487179487, "grad_norm": 0.774918794631958, "learning_rate": 6.6234973460234184e-06, "loss": 1.238166093826294, "step": 189 }, { "epoch": 1.294017094017094, "grad_norm": 0.7822843790054321, "learning_rate": 6.587909094515663e-06, "loss": 1.2424533367156982, "step": 190 }, { "epoch": 1.300854700854701, "grad_norm": 0.7934525012969971, "learning_rate": 6.552231147483448e-06, "loss": 1.1982380151748657, "step": 191 }, { "epoch": 1.3076923076923077, "grad_norm": 0.7817178964614868, "learning_rate": 6.5164655202513135e-06, "loss": 1.205663800239563, "step": 192 }, { "epoch": 1.3145299145299145, "grad_norm": 0.8002380728721619, "learning_rate": 6.480614233096558e-06, "loss": 1.1866426467895508, "step": 193 }, { "epoch": 1.3213675213675213, "grad_norm": 0.7488191723823547, "learning_rate": 6.444679311135112e-06, "loss": 1.2407163381576538, "step": 194 }, { "epoch": 1.3282051282051281, "grad_norm": 0.8069729208946228, "learning_rate": 6.408662784207149e-06, "loss": 1.2296785116195679, "step": 195 }, { "epoch": 1.335042735042735, "grad_norm": 0.8026877641677856, "learning_rate": 6.372566686762427e-06, "loss": 1.228287696838379, "step": 196 }, { "epoch": 1.341880341880342, "grad_norm": 0.7794991731643677, "learning_rate": 6.336393057745365e-06, "loss": 1.2325451374053955, "step": 197 }, { "epoch": 1.3487179487179488, "grad_norm": 0.7851534485816956, "learning_rate": 6.300143940479881e-06, "loss": 1.2433525323867798, "step": 198 }, { "epoch": 1.3555555555555556, "grad_norm": 0.7642512321472168, "learning_rate": 6.2638213825539595e-06, "loss": 1.2330515384674072, "step": 199 }, { "epoch": 1.3623931623931624, "grad_norm": 0.8071786165237427, "learning_rate": 6.227427435703997e-06, "loss": 1.2169106006622314, "step": 200 }, { "epoch": 1.3692307692307693, "grad_norm": 0.7421261668205261, "learning_rate": 6.190964155698903e-06, "loss": 1.1981184482574463, "step": 201 }, { "epoch": 1.376068376068376, "grad_norm": 0.7663130760192871, "learning_rate": 6.154433602223979e-06, "loss": 1.184199333190918, "step": 202 }, { "epoch": 1.3829059829059829, "grad_norm": 0.778105616569519, "learning_rate": 6.117837838764579e-06, "loss": 1.1941637992858887, "step": 203 }, { "epoch": 1.3897435897435897, "grad_norm": 0.7876622676849365, "learning_rate": 6.0811789324895365e-06, "loss": 1.1943039894104004, "step": 204 }, { "epoch": 1.3965811965811965, "grad_norm": 0.7890434861183167, "learning_rate": 6.044458954134411e-06, "loss": 1.1947365999221802, "step": 205 }, { "epoch": 1.4034188034188033, "grad_norm": 0.7558045387268066, "learning_rate": 6.0076799778845105e-06, "loss": 1.1994682550430298, "step": 206 }, { "epoch": 1.4102564102564101, "grad_norm": 0.7472313046455383, "learning_rate": 5.970844081257734e-06, "loss": 1.210819959640503, "step": 207 }, { "epoch": 1.4170940170940172, "grad_norm": 0.7487971782684326, "learning_rate": 5.933953344987215e-06, "loss": 1.1884093284606934, "step": 208 }, { "epoch": 1.423931623931624, "grad_norm": 0.7524631023406982, "learning_rate": 5.897009852903792e-06, "loss": 1.2101268768310547, "step": 209 }, { "epoch": 1.4307692307692308, "grad_norm": 0.7583618760108948, "learning_rate": 5.860015691818292e-06, "loss": 1.214969515800476, "step": 210 }, { "epoch": 1.4376068376068376, "grad_norm": 0.7619627118110657, "learning_rate": 5.82297295140367e-06, "loss": 1.1723865270614624, "step": 211 }, { "epoch": 1.4444444444444444, "grad_norm": 0.782787024974823, "learning_rate": 5.78588372407695e-06, "loss": 1.2125704288482666, "step": 212 }, { "epoch": 1.4512820512820512, "grad_norm": 0.7758169174194336, "learning_rate": 5.748750104881051e-06, "loss": 1.219278335571289, "step": 213 }, { "epoch": 1.458119658119658, "grad_norm": 0.7914722561836243, "learning_rate": 5.711574191366427e-06, "loss": 1.2299978733062744, "step": 214 }, { "epoch": 1.464957264957265, "grad_norm": 0.7562519907951355, "learning_rate": 5.674358083472598e-06, "loss": 1.1945183277130127, "step": 215 }, { "epoch": 1.471794871794872, "grad_norm": 0.7890987396240234, "learning_rate": 5.637103883409525e-06, "loss": 1.228225827217102, "step": 216 }, { "epoch": 1.4786324786324787, "grad_norm": 0.7438657879829407, "learning_rate": 5.599813695538866e-06, "loss": 1.1812902688980103, "step": 217 }, { "epoch": 1.4854700854700855, "grad_norm": 0.7696713805198669, "learning_rate": 5.562489626255104e-06, "loss": 1.2277076244354248, "step": 218 }, { "epoch": 1.4923076923076923, "grad_norm": 0.8019750714302063, "learning_rate": 5.52513378386657e-06, "loss": 1.2309683561325073, "step": 219 }, { "epoch": 1.4991452991452991, "grad_norm": 0.7668002247810364, "learning_rate": 5.487748278476342e-06, "loss": 1.2046821117401123, "step": 220 }, { "epoch": 1.4991452991452991, "eval_loss": 1.3131194114685059, "eval_runtime": 24.7008, "eval_samples_per_second": 39.918, "eval_steps_per_second": 5.02, "step": 220 }, { "epoch": 1.505982905982906, "grad_norm": 0.7732208967208862, "learning_rate": 5.450335221863068e-06, "loss": 1.2219358682632446, "step": 221 }, { "epoch": 1.5128205128205128, "grad_norm": 0.7456432580947876, "learning_rate": 5.412896727361663e-06, "loss": 1.2196807861328125, "step": 222 }, { "epoch": 1.5196581196581196, "grad_norm": 0.7411943674087524, "learning_rate": 5.375434909743942e-06, "loss": 1.2303682565689087, "step": 223 }, { "epoch": 1.5264957264957264, "grad_norm": 0.7763144373893738, "learning_rate": 5.337951885099167e-06, "loss": 1.188888669013977, "step": 224 }, { "epoch": 1.5333333333333332, "grad_norm": 0.8138889074325562, "learning_rate": 5.300449770714502e-06, "loss": 1.1965391635894775, "step": 225 }, { "epoch": 1.54017094017094, "grad_norm": 0.7770660519599915, "learning_rate": 5.262930684955439e-06, "loss": 1.233127474784851, "step": 226 }, { "epoch": 1.547008547008547, "grad_norm": 0.7718791961669922, "learning_rate": 5.225396747146112e-06, "loss": 1.240120768547058, "step": 227 }, { "epoch": 1.5538461538461539, "grad_norm": 0.7710370421409607, "learning_rate": 5.187850077449604e-06, "loss": 1.202008605003357, "step": 228 }, { "epoch": 1.5606837606837607, "grad_norm": 0.7775757908821106, "learning_rate": 5.150292796748174e-06, "loss": 1.2269346714019775, "step": 229 }, { "epoch": 1.5675213675213675, "grad_norm": 0.7479456067085266, "learning_rate": 5.112727026523461e-06, "loss": 1.1906824111938477, "step": 230 }, { "epoch": 1.5743589743589743, "grad_norm": 0.7567362189292908, "learning_rate": 5.075154888736653e-06, "loss": 1.1966190338134766, "step": 231 }, { "epoch": 1.5811965811965814, "grad_norm": 0.7536229491233826, "learning_rate": 5.03757850570861e-06, "loss": 1.1917792558670044, "step": 232 }, { "epoch": 1.5880341880341882, "grad_norm": 0.7776764035224915, "learning_rate": 5e-06, "loss": 1.1941741704940796, "step": 233 }, { "epoch": 1.594871794871795, "grad_norm": 0.7667071223258972, "learning_rate": 4.9624214942913916e-06, "loss": 1.1881437301635742, "step": 234 }, { "epoch": 1.6017094017094018, "grad_norm": 0.773404061794281, "learning_rate": 4.924845111263349e-06, "loss": 1.2190567255020142, "step": 235 }, { "epoch": 1.6085470085470086, "grad_norm": 0.7392263412475586, "learning_rate": 4.88727297347654e-06, "loss": 1.2026817798614502, "step": 236 }, { "epoch": 1.6153846153846154, "grad_norm": 0.7713451981544495, "learning_rate": 4.8497072032518274e-06, "loss": 1.2358677387237549, "step": 237 }, { "epoch": 1.6222222222222222, "grad_norm": 0.7625684142112732, "learning_rate": 4.8121499225503974e-06, "loss": 1.1716538667678833, "step": 238 }, { "epoch": 1.629059829059829, "grad_norm": 0.7581425309181213, "learning_rate": 4.774603252853889e-06, "loss": 1.1988354921340942, "step": 239 }, { "epoch": 1.6358974358974359, "grad_norm": 0.751584529876709, "learning_rate": 4.737069315044562e-06, "loss": 1.2101967334747314, "step": 240 }, { "epoch": 1.6427350427350427, "grad_norm": 0.7554129362106323, "learning_rate": 4.699550229285499e-06, "loss": 1.202675223350525, "step": 241 }, { "epoch": 1.6495726495726495, "grad_norm": 0.761131227016449, "learning_rate": 4.662048114900837e-06, "loss": 1.201820731163025, "step": 242 }, { "epoch": 1.6564102564102563, "grad_norm": 0.7265458703041077, "learning_rate": 4.624565090256059e-06, "loss": 1.2179176807403564, "step": 243 }, { "epoch": 1.6632478632478631, "grad_norm": 0.767880916595459, "learning_rate": 4.587103272638339e-06, "loss": 1.1769942045211792, "step": 244 }, { "epoch": 1.67008547008547, "grad_norm": 0.7633269429206848, "learning_rate": 4.549664778136933e-06, "loss": 1.2298530340194702, "step": 245 }, { "epoch": 1.676923076923077, "grad_norm": 0.7275070548057556, "learning_rate": 4.512251721523659e-06, "loss": 1.2158825397491455, "step": 246 }, { "epoch": 1.6837606837606838, "grad_norm": 0.7592760920524597, "learning_rate": 4.4748662161334335e-06, "loss": 1.207166314125061, "step": 247 }, { "epoch": 1.6905982905982906, "grad_norm": 0.7778440713882446, "learning_rate": 4.437510373744897e-06, "loss": 1.2096598148345947, "step": 248 }, { "epoch": 1.6974358974358974, "grad_norm": 0.7637122869491577, "learning_rate": 4.400186304461136e-06, "loss": 1.1851915121078491, "step": 249 }, { "epoch": 1.7042735042735044, "grad_norm": 0.7784591317176819, "learning_rate": 4.362896116590475e-06, "loss": 1.2293877601623535, "step": 250 }, { "epoch": 1.7111111111111112, "grad_norm": 0.8099437355995178, "learning_rate": 4.325641916527405e-06, "loss": 1.2101249694824219, "step": 251 }, { "epoch": 1.717948717948718, "grad_norm": 0.7552655339241028, "learning_rate": 4.2884258086335755e-06, "loss": 1.2240850925445557, "step": 252 }, { "epoch": 1.7247863247863249, "grad_norm": 0.7730560898780823, "learning_rate": 4.25124989511895e-06, "loss": 1.2249057292938232, "step": 253 }, { "epoch": 1.7316239316239317, "grad_norm": 0.7381757497787476, "learning_rate": 4.214116275923051e-06, "loss": 1.1832340955734253, "step": 254 }, { "epoch": 1.7384615384615385, "grad_norm": 0.739567756652832, "learning_rate": 4.17702704859633e-06, "loss": 1.200039267539978, "step": 255 }, { "epoch": 1.7452991452991453, "grad_norm": 0.774598240852356, "learning_rate": 4.1399843081817085e-06, "loss": 1.2123297452926636, "step": 256 }, { "epoch": 1.7521367521367521, "grad_norm": 0.8052539229393005, "learning_rate": 4.1029901470962105e-06, "loss": 1.2242088317871094, "step": 257 }, { "epoch": 1.758974358974359, "grad_norm": 0.7723326683044434, "learning_rate": 4.066046655012786e-06, "loss": 1.2281506061553955, "step": 258 }, { "epoch": 1.7658119658119658, "grad_norm": 0.7577686309814453, "learning_rate": 4.029155918742268e-06, "loss": 1.2183786630630493, "step": 259 }, { "epoch": 1.7726495726495726, "grad_norm": 0.7814478278160095, "learning_rate": 3.992320022115492e-06, "loss": 1.2138553857803345, "step": 260 }, { "epoch": 1.7794871794871794, "grad_norm": 0.7868865132331848, "learning_rate": 3.955541045865591e-06, "loss": 1.1890326738357544, "step": 261 }, { "epoch": 1.7863247863247862, "grad_norm": 0.7574802041053772, "learning_rate": 3.918821067510464e-06, "loss": 1.1699459552764893, "step": 262 }, { "epoch": 1.793162393162393, "grad_norm": 0.7787984013557434, "learning_rate": 3.882162161235421e-06, "loss": 1.1902029514312744, "step": 263 }, { "epoch": 1.8, "grad_norm": 0.780857264995575, "learning_rate": 3.845566397776022e-06, "loss": 1.1960508823394775, "step": 264 }, { "epoch": 1.8, "eval_loss": 1.308773159980774, "eval_runtime": 24.5858, "eval_samples_per_second": 40.104, "eval_steps_per_second": 5.044, "step": 264 }, { "epoch": 1.8068376068376069, "grad_norm": 0.7353282570838928, "learning_rate": 3.8090358443010993e-06, "loss": 1.2238385677337646, "step": 265 }, { "epoch": 1.8136752136752137, "grad_norm": 0.7844496369361877, "learning_rate": 3.7725725642960047e-06, "loss": 1.2065067291259766, "step": 266 }, { "epoch": 1.8205128205128205, "grad_norm": 0.7792806029319763, "learning_rate": 3.7361786174460414e-06, "loss": 1.1908563375473022, "step": 267 }, { "epoch": 1.8273504273504273, "grad_norm": 0.7404017448425293, "learning_rate": 3.6998560595201188e-06, "loss": 1.2162412405014038, "step": 268 }, { "epoch": 1.8341880341880343, "grad_norm": 0.7953075170516968, "learning_rate": 3.6636069422546363e-06, "loss": 1.2134095430374146, "step": 269 }, { "epoch": 1.8410256410256411, "grad_norm": 0.7584754824638367, "learning_rate": 3.627433313237576e-06, "loss": 1.2177472114562988, "step": 270 }, { "epoch": 1.847863247863248, "grad_norm": 0.7290381789207458, "learning_rate": 3.5913372157928515e-06, "loss": 1.189732551574707, "step": 271 }, { "epoch": 1.8547008547008548, "grad_norm": 0.7861201763153076, "learning_rate": 3.555320688864889e-06, "loss": 1.2073522806167603, "step": 272 }, { "epoch": 1.8615384615384616, "grad_norm": 0.7544710636138916, "learning_rate": 3.519385766903442e-06, "loss": 1.2041759490966797, "step": 273 }, { "epoch": 1.8683760683760684, "grad_norm": 0.7539916038513184, "learning_rate": 3.483534479748688e-06, "loss": 1.2057629823684692, "step": 274 }, { "epoch": 1.8752136752136752, "grad_norm": 0.7374740242958069, "learning_rate": 3.447768852516554e-06, "loss": 1.2203168869018555, "step": 275 }, { "epoch": 1.882051282051282, "grad_norm": 0.7594785690307617, "learning_rate": 3.4120909054843375e-06, "loss": 1.182802438735962, "step": 276 }, { "epoch": 1.8888888888888888, "grad_norm": 0.7542571425437927, "learning_rate": 3.3765026539765832e-06, "loss": 1.2168110609054565, "step": 277 }, { "epoch": 1.8957264957264957, "grad_norm": 0.7577287554740906, "learning_rate": 3.3410061082512422e-06, "loss": 1.2106308937072754, "step": 278 }, { "epoch": 1.9025641025641025, "grad_norm": 0.7561420798301697, "learning_rate": 3.3056032733861188e-06, "loss": 1.20242440700531, "step": 279 }, { "epoch": 1.9094017094017093, "grad_norm": 0.7456007599830627, "learning_rate": 3.2702961491656197e-06, "loss": 1.2251598834991455, "step": 280 }, { "epoch": 1.916239316239316, "grad_norm": 0.790366530418396, "learning_rate": 3.2350867299677802e-06, "loss": 1.2062650918960571, "step": 281 }, { "epoch": 1.9230769230769231, "grad_norm": 0.7317772507667542, "learning_rate": 3.1999770046516198e-06, "loss": 1.1729378700256348, "step": 282 }, { "epoch": 1.92991452991453, "grad_norm": 0.7773919105529785, "learning_rate": 3.164968956444791e-06, "loss": 1.1983883380889893, "step": 283 }, { "epoch": 1.9367521367521368, "grad_norm": 0.7585593461990356, "learning_rate": 3.130064562831553e-06, "loss": 1.2086600065231323, "step": 284 }, { "epoch": 1.9435897435897436, "grad_norm": 0.7703876495361328, "learning_rate": 3.0952657954410792e-06, "loss": 1.2189124822616577, "step": 285 }, { "epoch": 1.9504273504273504, "grad_norm": 0.7693601250648499, "learning_rate": 3.0605746199360755e-06, "loss": 1.210176706314087, "step": 286 }, { "epoch": 1.9572649572649574, "grad_norm": 0.7466776967048645, "learning_rate": 3.0259929959017585e-06, "loss": 1.2027801275253296, "step": 287 }, { "epoch": 1.9641025641025642, "grad_norm": 0.772388219833374, "learning_rate": 2.991522876735154e-06, "loss": 1.2112243175506592, "step": 288 }, { "epoch": 1.970940170940171, "grad_norm": 0.7715580463409424, "learning_rate": 2.95716620953476e-06, "loss": 1.1904889345169067, "step": 289 }, { "epoch": 1.9777777777777779, "grad_norm": 0.7397588491439819, "learning_rate": 2.9229249349905686e-06, "loss": 1.1913639307022095, "step": 290 }, { "epoch": 1.9846153846153847, "grad_norm": 0.7530134916305542, "learning_rate": 2.8888009872744332e-06, "loss": 1.2205219268798828, "step": 291 }, { "epoch": 1.9914529914529915, "grad_norm": 0.7689472436904907, "learning_rate": 2.8547962939308187e-06, "loss": 1.2000938653945923, "step": 292 }, { "epoch": 1.9982905982905983, "grad_norm": 0.7348621487617493, "learning_rate": 2.8209127757679246e-06, "loss": 1.1786831617355347, "step": 293 }, { "epoch": 2.0, "grad_norm": 1.537250280380249, "learning_rate": 2.787152346749173e-06, "loss": 1.1778086423873901, "step": 294 }, { "epoch": 2.006837606837607, "grad_norm": 0.9093112945556641, "learning_rate": 2.7535169138851124e-06, "loss": 1.1308534145355225, "step": 295 }, { "epoch": 2.0136752136752136, "grad_norm": 0.895119845867157, "learning_rate": 2.720008377125682e-06, "loss": 1.1030248403549194, "step": 296 }, { "epoch": 2.0205128205128204, "grad_norm": 0.822189211845398, "learning_rate": 2.686628629252899e-06, "loss": 1.0862432718276978, "step": 297 }, { "epoch": 2.0273504273504273, "grad_norm": 0.839640200138092, "learning_rate": 2.6533795557739407e-06, "loss": 1.0923850536346436, "step": 298 }, { "epoch": 2.034188034188034, "grad_norm": 0.7948157787322998, "learning_rate": 2.6202630348146323e-06, "loss": 1.1080037355422974, "step": 299 }, { "epoch": 2.041025641025641, "grad_norm": 0.7708576321601868, "learning_rate": 2.5872809370133704e-06, "loss": 1.133652687072754, "step": 300 }, { "epoch": 2.0478632478632477, "grad_norm": 0.784568727016449, "learning_rate": 2.5544351254154407e-06, "loss": 1.1596778631210327, "step": 301 }, { "epoch": 2.0547008547008545, "grad_norm": 0.8119481205940247, "learning_rate": 2.5217274553677975e-06, "loss": 1.129364252090454, "step": 302 }, { "epoch": 2.0615384615384613, "grad_norm": 0.7969528436660767, "learning_rate": 2.489159774414252e-06, "loss": 1.0949797630310059, "step": 303 }, { "epoch": 2.0683760683760686, "grad_norm": 0.823360800743103, "learning_rate": 2.4567339221911086e-06, "loss": 1.1301119327545166, "step": 304 }, { "epoch": 2.0752136752136754, "grad_norm": 0.8292282223701477, "learning_rate": 2.424451730323261e-06, "loss": 1.1120922565460205, "step": 305 }, { "epoch": 2.082051282051282, "grad_norm": 0.8004986047744751, "learning_rate": 2.3923150223207176e-06, "loss": 1.1214550733566284, "step": 306 }, { "epoch": 2.088888888888889, "grad_norm": 0.8165397644042969, "learning_rate": 2.3603256134756066e-06, "loss": 1.1209532022476196, "step": 307 }, { "epoch": 2.095726495726496, "grad_norm": 0.8034455180168152, "learning_rate": 2.328485310759635e-06, "loss": 1.1401094198226929, "step": 308 }, { "epoch": 2.095726495726496, "eval_loss": 1.3253560066223145, "eval_runtime": 24.6122, "eval_samples_per_second": 40.061, "eval_steps_per_second": 5.038, "step": 308 }, { "epoch": 2.1025641025641026, "grad_norm": 0.7844864130020142, "learning_rate": 2.296795912722014e-06, "loss": 1.144791603088379, "step": 309 }, { "epoch": 2.1094017094017095, "grad_norm": 0.7857894897460938, "learning_rate": 2.265259209387867e-06, "loss": 1.1488922834396362, "step": 310 }, { "epoch": 2.1162393162393163, "grad_norm": 0.7851693630218506, "learning_rate": 2.2338769821571225e-06, "loss": 1.1399354934692383, "step": 311 }, { "epoch": 2.123076923076923, "grad_norm": 0.8227202296257019, "learning_rate": 2.202651003703885e-06, "loss": 1.1063587665557861, "step": 312 }, { "epoch": 2.12991452991453, "grad_norm": 0.822938084602356, "learning_rate": 2.1715830378763025e-06, "loss": 1.1050540208816528, "step": 313 }, { "epoch": 2.1367521367521367, "grad_norm": 0.8058551549911499, "learning_rate": 2.140674839596931e-06, "loss": 1.0922585725784302, "step": 314 }, { "epoch": 2.1435897435897435, "grad_norm": 0.7917458415031433, "learning_rate": 2.109928154763606e-06, "loss": 1.1247828006744385, "step": 315 }, { "epoch": 2.1504273504273503, "grad_norm": 0.8290326595306396, "learning_rate": 2.0793447201508288e-06, "loss": 1.1369386911392212, "step": 316 }, { "epoch": 2.157264957264957, "grad_norm": 0.7832273840904236, "learning_rate": 2.0489262633116536e-06, "loss": 1.110697627067566, "step": 317 }, { "epoch": 2.164102564102564, "grad_norm": 0.7919285297393799, "learning_rate": 2.01867450248011e-06, "loss": 1.157274842262268, "step": 318 }, { "epoch": 2.1709401709401708, "grad_norm": 0.7776212096214294, "learning_rate": 1.9885911464741413e-06, "loss": 1.139618992805481, "step": 319 }, { "epoch": 2.1777777777777776, "grad_norm": 0.7800706624984741, "learning_rate": 1.9586778945990785e-06, "loss": 1.1110671758651733, "step": 320 }, { "epoch": 2.184615384615385, "grad_norm": 0.8117327094078064, "learning_rate": 1.928936436551661e-06, "loss": 1.1395684480667114, "step": 321 }, { "epoch": 2.1914529914529917, "grad_norm": 0.7962910532951355, "learning_rate": 1.8993684523245842e-06, "loss": 1.1162846088409424, "step": 322 }, { "epoch": 2.1982905982905985, "grad_norm": 0.7874794602394104, "learning_rate": 1.8699756121115997e-06, "loss": 1.1188956499099731, "step": 323 }, { "epoch": 2.2051282051282053, "grad_norm": 0.785068690776825, "learning_rate": 1.8407595762131814e-06, "loss": 1.1131058931350708, "step": 324 }, { "epoch": 2.211965811965812, "grad_norm": 0.8046601414680481, "learning_rate": 1.811721994942731e-06, "loss": 1.1231977939605713, "step": 325 }, { "epoch": 2.218803418803419, "grad_norm": 0.759477972984314, "learning_rate": 1.7828645085333645e-06, "loss": 1.1036738157272339, "step": 326 }, { "epoch": 2.2256410256410257, "grad_norm": 0.7955328226089478, "learning_rate": 1.7541887470452606e-06, "loss": 1.166395664215088, "step": 327 }, { "epoch": 2.2324786324786325, "grad_norm": 0.7807881236076355, "learning_rate": 1.7256963302735752e-06, "loss": 1.1385221481323242, "step": 328 }, { "epoch": 2.2393162393162394, "grad_norm": 0.7881447076797485, "learning_rate": 1.6973888676569594e-06, "loss": 1.145586609840393, "step": 329 }, { "epoch": 2.246153846153846, "grad_norm": 0.8092402815818787, "learning_rate": 1.6692679581866334e-06, "loss": 1.1422295570373535, "step": 330 }, { "epoch": 2.252991452991453, "grad_norm": 0.7870088219642639, "learning_rate": 1.6413351903160763e-06, "loss": 1.1302958726882935, "step": 331 }, { "epoch": 2.25982905982906, "grad_norm": 0.8018279075622559, "learning_rate": 1.6135921418712959e-06, "loss": 1.114201545715332, "step": 332 }, { "epoch": 2.2666666666666666, "grad_norm": 0.7955658435821533, "learning_rate": 1.5860403799616951e-06, "loss": 1.1686758995056152, "step": 333 }, { "epoch": 2.2735042735042734, "grad_norm": 0.8098942637443542, "learning_rate": 1.5586814608915673e-06, "loss": 1.1103954315185547, "step": 334 }, { "epoch": 2.2803418803418802, "grad_norm": 0.7653470039367676, "learning_rate": 1.5315169300721694e-06, "loss": 1.1263670921325684, "step": 335 }, { "epoch": 2.287179487179487, "grad_norm": 0.7954714894294739, "learning_rate": 1.5045483219344387e-06, "loss": 1.091448187828064, "step": 336 }, { "epoch": 2.294017094017094, "grad_norm": 0.7870411276817322, "learning_rate": 1.4777771598423147e-06, "loss": 1.127175211906433, "step": 337 }, { "epoch": 2.3008547008547007, "grad_norm": 0.8070060014724731, "learning_rate": 1.4512049560066837e-06, "loss": 1.1385235786437988, "step": 338 }, { "epoch": 2.3076923076923075, "grad_norm": 0.7654244303703308, "learning_rate": 1.4248332113999708e-06, "loss": 1.1272555589675903, "step": 339 }, { "epoch": 2.3145299145299143, "grad_norm": 0.7763322591781616, "learning_rate": 1.3986634156713418e-06, "loss": 1.1271766424179077, "step": 340 }, { "epoch": 2.3213675213675216, "grad_norm": 0.7544705867767334, "learning_rate": 1.3726970470625705e-06, "loss": 1.157515525817871, "step": 341 }, { "epoch": 2.3282051282051284, "grad_norm": 0.7676778435707092, "learning_rate": 1.3469355723245303e-06, "loss": 1.1277141571044922, "step": 342 }, { "epoch": 2.335042735042735, "grad_norm": 0.7713337540626526, "learning_rate": 1.321380446634342e-06, "loss": 1.1003583669662476, "step": 343 }, { "epoch": 2.341880341880342, "grad_norm": 0.7740820646286011, "learning_rate": 1.2960331135131826e-06, "loss": 1.1071029901504517, "step": 344 }, { "epoch": 2.348717948717949, "grad_norm": 0.758073091506958, "learning_rate": 1.270895004744737e-06, "loss": 1.110722303390503, "step": 345 }, { "epoch": 2.3555555555555556, "grad_norm": 0.7693141102790833, "learning_rate": 1.245967540294329e-06, "loss": 1.097144365310669, "step": 346 }, { "epoch": 2.3623931623931624, "grad_norm": 0.7613301873207092, "learning_rate": 1.2212521282287093e-06, "loss": 1.130142092704773, "step": 347 }, { "epoch": 2.3692307692307693, "grad_norm": 0.7610928416252136, "learning_rate": 1.1967501646365147e-06, "loss": 1.1337437629699707, "step": 348 }, { "epoch": 2.376068376068376, "grad_norm": 0.7692887187004089, "learning_rate": 1.172463033549418e-06, "loss": 1.1064190864562988, "step": 349 }, { "epoch": 2.382905982905983, "grad_norm": 0.7826989889144897, "learning_rate": 1.1483921068639353e-06, "loss": 1.1885005235671997, "step": 350 }, { "epoch": 2.3897435897435897, "grad_norm": 0.7613060474395752, "learning_rate": 1.1245387442639456e-06, "loss": 1.110337734222412, "step": 351 }, { "epoch": 2.3965811965811965, "grad_norm": 0.7910706400871277, "learning_rate": 1.1009042931438784e-06, "loss": 1.1144278049468994, "step": 352 }, { "epoch": 2.3965811965811965, "eval_loss": 1.323965311050415, "eval_runtime": 24.7109, "eval_samples_per_second": 39.901, "eval_steps_per_second": 5.018, "step": 352 }, { "epoch": 2.4034188034188033, "grad_norm": 0.7570564150810242, "learning_rate": 1.077490088532605e-06, "loss": 1.114471435546875, "step": 353 }, { "epoch": 2.41025641025641, "grad_norm": 0.7983273863792419, "learning_rate": 1.0542974530180327e-06, "loss": 1.132286787033081, "step": 354 }, { "epoch": 2.417094017094017, "grad_norm": 0.7606459856033325, "learning_rate": 1.0313276966723867e-06, "loss": 1.0865505933761597, "step": 355 }, { "epoch": 2.4239316239316238, "grad_norm": 0.7879711389541626, "learning_rate": 1.00858211697822e-06, "loss": 1.1440324783325195, "step": 356 }, { "epoch": 2.430769230769231, "grad_norm": 0.762718915939331, "learning_rate": 9.860619987551157e-07, "loss": 1.1018445491790771, "step": 357 }, { "epoch": 2.437606837606838, "grad_norm": 0.7899941802024841, "learning_rate": 9.637686140871121e-07, "loss": 1.1469783782958984, "step": 358 }, { "epoch": 2.4444444444444446, "grad_norm": 0.7909042239189148, "learning_rate": 9.417032222508476e-07, "loss": 1.1333407163619995, "step": 359 }, { "epoch": 2.4512820512820515, "grad_norm": 0.7936816811561584, "learning_rate": 9.198670696444339e-07, "loss": 1.1438573598861694, "step": 360 }, { "epoch": 2.4581196581196583, "grad_norm": 0.7882561683654785, "learning_rate": 8.982613897170439e-07, "loss": 1.1176822185516357, "step": 361 }, { "epoch": 2.464957264957265, "grad_norm": 0.7810674905776978, "learning_rate": 8.768874028992431e-07, "loss": 1.135961651802063, "step": 362 }, { "epoch": 2.471794871794872, "grad_norm": 0.7794176340103149, "learning_rate": 8.557463165340479e-07, "loss": 1.1315698623657227, "step": 363 }, { "epoch": 2.4786324786324787, "grad_norm": 0.7674309611320496, "learning_rate": 8.348393248087289e-07, "loss": 1.1471264362335205, "step": 364 }, { "epoch": 2.4854700854700855, "grad_norm": 0.7684411406517029, "learning_rate": 8.141676086873574e-07, "loss": 1.1023811101913452, "step": 365 }, { "epoch": 2.4923076923076923, "grad_norm": 0.7729819416999817, "learning_rate": 7.937323358440935e-07, "loss": 1.1146825551986694, "step": 366 }, { "epoch": 2.499145299145299, "grad_norm": 0.7710589170455933, "learning_rate": 7.735346605972322e-07, "loss": 1.1076273918151855, "step": 367 }, { "epoch": 2.505982905982906, "grad_norm": 0.7700541019439697, "learning_rate": 7.535757238439939e-07, "loss": 1.1303023099899292, "step": 368 }, { "epoch": 2.5128205128205128, "grad_norm": 0.7796255946159363, "learning_rate": 7.338566529960817e-07, "loss": 1.1434168815612793, "step": 369 }, { "epoch": 2.5196581196581196, "grad_norm": 0.7890748977661133, "learning_rate": 7.143785619160026e-07, "loss": 1.137059211730957, "step": 370 }, { "epoch": 2.5264957264957264, "grad_norm": 0.7733116149902344, "learning_rate": 6.951425508541432e-07, "loss": 1.1050790548324585, "step": 371 }, { "epoch": 2.533333333333333, "grad_norm": 0.7718008160591125, "learning_rate": 6.761497063866207e-07, "loss": 1.1239290237426758, "step": 372 }, { "epoch": 2.54017094017094, "grad_norm": 0.7675129771232605, "learning_rate": 6.574011013539111e-07, "loss": 1.1362709999084473, "step": 373 }, { "epoch": 2.547008547008547, "grad_norm": 0.7831134796142578, "learning_rate": 6.388977948002406e-07, "loss": 1.1359511613845825, "step": 374 }, { "epoch": 2.5538461538461537, "grad_norm": 0.7688263654708862, "learning_rate": 6.206408319137703e-07, "loss": 1.1311153173446655, "step": 375 }, { "epoch": 2.5606837606837605, "grad_norm": 0.7608706951141357, "learning_rate": 6.026312439675553e-07, "loss": 1.1158239841461182, "step": 376 }, { "epoch": 2.5675213675213673, "grad_norm": 0.7655665278434753, "learning_rate": 5.848700482612873e-07, "loss": 1.1498501300811768, "step": 377 }, { "epoch": 2.574358974358974, "grad_norm": 0.7795934081077576, "learning_rate": 5.673582480638395e-07, "loss": 1.1341049671173096, "step": 378 }, { "epoch": 2.5811965811965814, "grad_norm": 0.7773811221122742, "learning_rate": 5.500968325565859e-07, "loss": 1.1404979228973389, "step": 379 }, { "epoch": 2.588034188034188, "grad_norm": 0.8611118793487549, "learning_rate": 5.330867767775333e-07, "loss": 1.0921636819839478, "step": 380 }, { "epoch": 2.594871794871795, "grad_norm": 0.745428204536438, "learning_rate": 5.163290415662408e-07, "loss": 1.1557259559631348, "step": 381 }, { "epoch": 2.601709401709402, "grad_norm": 0.7756429314613342, "learning_rate": 4.998245735095459e-07, "loss": 1.1447691917419434, "step": 382 }, { "epoch": 2.6085470085470086, "grad_norm": 0.7908133864402771, "learning_rate": 4.835743048880959e-07, "loss": 1.143109917640686, "step": 383 }, { "epoch": 2.6153846153846154, "grad_norm": 0.7732424736022949, "learning_rate": 4.6757915362368567e-07, "loss": 1.132035493850708, "step": 384 }, { "epoch": 2.6222222222222222, "grad_norm": 0.7889422178268433, "learning_rate": 4.5184002322740784e-07, "loss": 1.1180846691131592, "step": 385 }, { "epoch": 2.629059829059829, "grad_norm": 0.7938551902770996, "learning_rate": 4.363578027486187e-07, "loss": 1.1456289291381836, "step": 386 }, { "epoch": 2.635897435897436, "grad_norm": 0.8030667901039124, "learning_rate": 4.211333667247125e-07, "loss": 1.1397569179534912, "step": 387 }, { "epoch": 2.6427350427350427, "grad_norm": 0.7819530367851257, "learning_rate": 4.0616757513173123e-07, "loss": 1.1004501581192017, "step": 388 }, { "epoch": 2.6495726495726495, "grad_norm": 0.758314311504364, "learning_rate": 3.9146127333577757e-07, "loss": 1.1101858615875244, "step": 389 }, { "epoch": 2.6564102564102563, "grad_norm": 0.7801131010055542, "learning_rate": 3.7701529204526856e-07, "loss": 1.1453076601028442, "step": 390 }, { "epoch": 2.663247863247863, "grad_norm": 0.7489244937896729, "learning_rate": 3.6283044726401594e-07, "loss": 1.0911612510681152, "step": 391 }, { "epoch": 2.67008547008547, "grad_norm": 0.761225700378418, "learning_rate": 3.4890754024512254e-07, "loss": 1.130741000175476, "step": 392 }, { "epoch": 2.676923076923077, "grad_norm": 0.761887788772583, "learning_rate": 3.352473574457304e-07, "loss": 1.120837926864624, "step": 393 }, { "epoch": 2.683760683760684, "grad_norm": 0.7792303562164307, "learning_rate": 3.2185067048259245e-07, "loss": 1.1177864074707031, "step": 394 }, { "epoch": 2.690598290598291, "grad_norm": 0.7689954042434692, "learning_rate": 3.087182360884872e-07, "loss": 1.177292823791504, "step": 395 }, { "epoch": 2.6974358974358976, "grad_norm": 0.7710866332054138, "learning_rate": 2.9585079606947843e-07, "loss": 1.1195672750473022, "step": 396 }, { "epoch": 2.6974358974358976, "eval_loss": 1.3236175775527954, "eval_runtime": 24.7082, "eval_samples_per_second": 39.906, "eval_steps_per_second": 5.019, "step": 396 }, { "epoch": 2.7042735042735044, "grad_norm": 0.7776737809181213, "learning_rate": 2.8324907726300366e-07, "loss": 1.113619327545166, "step": 397 }, { "epoch": 2.7111111111111112, "grad_norm": 0.7743112444877625, "learning_rate": 2.7091379149682683e-07, "loss": 1.0938081741333008, "step": 398 }, { "epoch": 2.717948717948718, "grad_norm": 0.7779694199562073, "learning_rate": 2.5884563554882336e-07, "loss": 1.1138122081756592, "step": 399 }, { "epoch": 2.724786324786325, "grad_norm": 0.7622742652893066, "learning_rate": 2.470452911076227e-07, "loss": 1.1006677150726318, "step": 400 }, { "epoch": 2.7316239316239317, "grad_norm": 0.7664272785186768, "learning_rate": 2.355134247341073e-07, "loss": 1.1065200567245483, "step": 401 }, { "epoch": 2.7384615384615385, "grad_norm": 0.7712447643280029, "learning_rate": 2.242506878237538e-07, "loss": 1.1020417213439941, "step": 402 }, { "epoch": 2.7452991452991453, "grad_norm": 0.7656382322311401, "learning_rate": 2.1325771656984075e-07, "loss": 1.1001569032669067, "step": 403 }, { "epoch": 2.752136752136752, "grad_norm": 0.7811654806137085, "learning_rate": 2.0253513192751374e-07, "loss": 1.1310510635375977, "step": 404 }, { "epoch": 2.758974358974359, "grad_norm": 0.7687283158302307, "learning_rate": 1.9208353957870684e-07, "loss": 1.146543264389038, "step": 405 }, { "epoch": 2.7658119658119658, "grad_norm": 0.7670867443084717, "learning_rate": 1.8190352989793325e-07, "loss": 1.1161731481552124, "step": 406 }, { "epoch": 2.7726495726495726, "grad_norm": 0.7807978391647339, "learning_rate": 1.7199567791893524e-07, "loss": 1.1282137632369995, "step": 407 }, { "epoch": 2.7794871794871794, "grad_norm": 0.7957569360733032, "learning_rate": 1.6236054330219853e-07, "loss": 1.1041632890701294, "step": 408 }, { "epoch": 2.786324786324786, "grad_norm": 0.7832216024398804, "learning_rate": 1.5299867030334815e-07, "loss": 1.108730435371399, "step": 409 }, { "epoch": 2.793162393162393, "grad_norm": 0.753606915473938, "learning_rate": 1.439105877423963e-07, "loss": 1.131809115409851, "step": 410 }, { "epoch": 2.8, "grad_norm": 0.7802961468696594, "learning_rate": 1.350968089738758e-07, "loss": 1.1083602905273438, "step": 411 }, { "epoch": 2.8068376068376066, "grad_norm": 0.768670380115509, "learning_rate": 1.2655783185784253e-07, "loss": 1.1080389022827148, "step": 412 }, { "epoch": 2.8136752136752134, "grad_norm": 0.7562652230262756, "learning_rate": 1.1829413873174988e-07, "loss": 1.1086317300796509, "step": 413 }, { "epoch": 2.8205128205128203, "grad_norm": 0.763107180595398, "learning_rate": 1.1030619638320805e-07, "loss": 1.1433099508285522, "step": 414 }, { "epoch": 2.827350427350427, "grad_norm": 0.7749531865119934, "learning_rate": 1.0259445602361084e-07, "loss": 1.129563331604004, "step": 415 }, { "epoch": 2.8341880341880343, "grad_norm": 0.7604458928108215, "learning_rate": 9.51593532626538e-08, "loss": 1.120940089225769, "step": 416 }, { "epoch": 2.841025641025641, "grad_norm": 0.750518262386322, "learning_rate": 8.800130808372553e-08, "loss": 1.0916835069656372, "step": 417 }, { "epoch": 2.847863247863248, "grad_norm": 0.7595433592796326, "learning_rate": 8.11207248201834e-08, "loss": 1.1178152561187744, "step": 418 }, { "epoch": 2.8547008547008548, "grad_norm": 0.7640005350112915, "learning_rate": 7.45179921325162e-08, "loss": 1.1630092859268188, "step": 419 }, { "epoch": 2.8615384615384616, "grad_norm": 0.8447228074073792, "learning_rate": 6.819348298638839e-08, "loss": 1.1273298263549805, "step": 420 }, { "epoch": 2.8683760683760684, "grad_norm": 0.7577494978904724, "learning_rate": 6.214755463157417e-08, "loss": 1.0993590354919434, "step": 421 }, { "epoch": 2.875213675213675, "grad_norm": 0.7751004099845886, "learning_rate": 5.638054858177644e-08, "loss": 1.1498969793319702, "step": 422 }, { "epoch": 2.882051282051282, "grad_norm": 0.7662968039512634, "learning_rate": 5.089279059533658e-08, "loss": 1.1176806688308716, "step": 423 }, { "epoch": 2.888888888888889, "grad_norm": 0.7827076315879822, "learning_rate": 4.568459065683206e-08, "loss": 1.1449580192565918, "step": 424 }, { "epoch": 2.8957264957264957, "grad_norm": 0.7646909952163696, "learning_rate": 4.0756242959567596e-08, "loss": 1.1186950206756592, "step": 425 }, { "epoch": 2.9025641025641025, "grad_norm": 0.7541195154190063, "learning_rate": 3.610802588895845e-08, "loss": 1.131952166557312, "step": 426 }, { "epoch": 2.9094017094017093, "grad_norm": 0.7776208519935608, "learning_rate": 3.1740202006804166e-08, "loss": 1.1178792715072632, "step": 427 }, { "epoch": 2.916239316239316, "grad_norm": 0.7766209244728088, "learning_rate": 2.765301803645426e-08, "loss": 1.1331486701965332, "step": 428 }, { "epoch": 2.9230769230769234, "grad_norm": 0.7666369676589966, "learning_rate": 2.3846704848878298e-08, "loss": 1.1589261293411255, "step": 429 }, { "epoch": 2.92991452991453, "grad_norm": 0.7775545716285706, "learning_rate": 2.0321477449619098e-08, "loss": 1.1344677209854126, "step": 430 }, { "epoch": 2.936752136752137, "grad_norm": 0.7537861466407776, "learning_rate": 1.7077534966650767e-08, "loss": 1.1040513515472412, "step": 431 }, { "epoch": 2.943589743589744, "grad_norm": 0.7825785875320435, "learning_rate": 1.411506063912882e-08, "loss": 1.1581734418869019, "step": 432 }, { "epoch": 2.9504273504273506, "grad_norm": 0.7491230368614197, "learning_rate": 1.1434221807041234e-08, "loss": 1.1113041639328003, "step": 433 }, { "epoch": 2.9572649572649574, "grad_norm": 0.7601305842399597, "learning_rate": 9.035169901754902e-09, "loss": 1.0998278856277466, "step": 434 }, { "epoch": 2.9641025641025642, "grad_norm": 0.7869414687156677, "learning_rate": 6.918040437463025e-09, "loss": 1.1475398540496826, "step": 435 }, { "epoch": 2.970940170940171, "grad_norm": 0.760128915309906, "learning_rate": 5.082953003528457e-09, "loss": 1.1517993211746216, "step": 436 }, { "epoch": 2.977777777777778, "grad_norm": 0.7626367211341858, "learning_rate": 3.530011257730226e-09, "loss": 1.1134616136550903, "step": 437 }, { "epoch": 2.9846153846153847, "grad_norm": 0.765670657157898, "learning_rate": 2.2593029204076578e-09, "loss": 1.1342540979385376, "step": 438 }, { "epoch": 2.9914529914529915, "grad_norm": 0.7739811539649963, "learning_rate": 1.2708997695043412e-09, "loss": 1.1077520847320557, "step": 439 }, { "epoch": 2.9982905982905983, "grad_norm": 0.7707903385162354, "learning_rate": 5.648576365169245e-10, "loss": 1.0939933061599731, "step": 440 }, { "epoch": 2.9982905982905983, "eval_loss": 1.3233778476715088, "eval_runtime": 24.6851, "eval_samples_per_second": 39.943, "eval_steps_per_second": 5.023, "step": 440 }, { "epoch": 3.0, "grad_norm": 1.5993366241455078, "learning_rate": 1.4121640333653042e-10, "loss": 1.0642163753509521, "step": 441 }, { "epoch": 3.0, "step": 441, "total_flos": 9.743300044908134e+17, "train_loss": 1.2459275746832088, "train_runtime": 6646.3979, "train_samples_per_second": 8.449, "train_steps_per_second": 0.066 } ], "logging_steps": 1.0, "max_steps": 441, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 44, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.743300044908134e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }