{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.993968636911942, "eval_steps": 500, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003216726980297547, "grad_norm": 81.91886138916016, "learning_rate": 1.6129032258064518e-07, "loss": 0.3558, "step": 1 }, { "epoch": 0.006433453960595094, "grad_norm": 88.99775695800781, "learning_rate": 3.2258064516129035e-07, "loss": 0.4071, "step": 2 }, { "epoch": 0.009650180940892641, "grad_norm": 79.32195281982422, "learning_rate": 4.838709677419355e-07, "loss": 0.3576, "step": 3 }, { "epoch": 0.012866907921190189, "grad_norm": 81.10282897949219, "learning_rate": 6.451612903225807e-07, "loss": 0.3627, "step": 4 }, { "epoch": 0.016083634901487735, "grad_norm": 75.47724914550781, "learning_rate": 8.064516129032258e-07, "loss": 0.3581, "step": 5 }, { "epoch": 0.019300361881785282, "grad_norm": 73.94715118408203, "learning_rate": 9.67741935483871e-07, "loss": 0.3392, "step": 6 }, { "epoch": 0.02251708886208283, "grad_norm": 54.852840423583984, "learning_rate": 1.1290322580645162e-06, "loss": 0.2786, "step": 7 }, { "epoch": 0.025733815842380377, "grad_norm": 51.52576446533203, "learning_rate": 1.2903225806451614e-06, "loss": 0.27, "step": 8 }, { "epoch": 0.028950542822677925, "grad_norm": 17.170745849609375, "learning_rate": 1.4516129032258066e-06, "loss": 0.1758, "step": 9 }, { "epoch": 0.03216726980297547, "grad_norm": 10.435275077819824, "learning_rate": 1.6129032258064516e-06, "loss": 0.1723, "step": 10 }, { "epoch": 0.03538399678327302, "grad_norm": 8.09239387512207, "learning_rate": 1.774193548387097e-06, "loss": 0.1565, "step": 11 }, { "epoch": 0.038600723763570564, "grad_norm": 41.39366149902344, "learning_rate": 1.935483870967742e-06, "loss": 0.1746, "step": 12 }, { "epoch": 0.04181745074386811, "grad_norm": 20.37698745727539, "learning_rate": 2.096774193548387e-06, "loss": 0.1709, "step": 13 }, { "epoch": 0.04503417772416566, "grad_norm": 10.368378639221191, "learning_rate": 2.2580645161290324e-06, "loss": 0.1567, "step": 14 }, { "epoch": 0.04825090470446321, "grad_norm": 14.423890113830566, "learning_rate": 2.4193548387096776e-06, "loss": 0.1344, "step": 15 }, { "epoch": 0.051467631684760755, "grad_norm": 13.449414253234863, "learning_rate": 2.580645161290323e-06, "loss": 0.1306, "step": 16 }, { "epoch": 0.0546843586650583, "grad_norm": 5.699060916900635, "learning_rate": 2.7419354838709676e-06, "loss": 0.1368, "step": 17 }, { "epoch": 0.05790108564535585, "grad_norm": 38.163658142089844, "learning_rate": 2.903225806451613e-06, "loss": 0.1767, "step": 18 }, { "epoch": 0.0611178126256534, "grad_norm": 43.165924072265625, "learning_rate": 3.0645161290322584e-06, "loss": 0.2103, "step": 19 }, { "epoch": 0.06433453960595094, "grad_norm": 34.02187728881836, "learning_rate": 3.225806451612903e-06, "loss": 0.1798, "step": 20 }, { "epoch": 0.06755126658624849, "grad_norm": 15.196747779846191, "learning_rate": 3.3870967741935484e-06, "loss": 0.1326, "step": 21 }, { "epoch": 0.07076799356654603, "grad_norm": 12.960094451904297, "learning_rate": 3.548387096774194e-06, "loss": 0.132, "step": 22 }, { "epoch": 0.07398472054684359, "grad_norm": 18.067081451416016, "learning_rate": 3.7096774193548392e-06, "loss": 0.1407, "step": 23 }, { "epoch": 0.07720144752714113, "grad_norm": 11.63009262084961, "learning_rate": 3.870967741935484e-06, "loss": 0.1198, "step": 24 }, { "epoch": 0.08041817450743868, "grad_norm": 3.6088430881500244, "learning_rate": 4.032258064516129e-06, "loss": 0.0861, "step": 25 }, { "epoch": 0.08363490148773622, "grad_norm": 10.469372749328613, "learning_rate": 4.193548387096774e-06, "loss": 0.1043, "step": 26 }, { "epoch": 0.08685162846803378, "grad_norm": 14.186717987060547, "learning_rate": 4.35483870967742e-06, "loss": 0.135, "step": 27 }, { "epoch": 0.09006835544833132, "grad_norm": 7.1297454833984375, "learning_rate": 4.516129032258065e-06, "loss": 0.1171, "step": 28 }, { "epoch": 0.09328508242862887, "grad_norm": 2.476020574569702, "learning_rate": 4.67741935483871e-06, "loss": 0.089, "step": 29 }, { "epoch": 0.09650180940892641, "grad_norm": 3.7916111946105957, "learning_rate": 4.838709677419355e-06, "loss": 0.0881, "step": 30 }, { "epoch": 0.09971853638922397, "grad_norm": 1.1642123460769653, "learning_rate": 5e-06, "loss": 0.0965, "step": 31 }, { "epoch": 0.10293526336952151, "grad_norm": 3.4518535137176514, "learning_rate": 5.161290322580646e-06, "loss": 0.0849, "step": 32 }, { "epoch": 0.10615199034981906, "grad_norm": 2.2884507179260254, "learning_rate": 5.322580645161291e-06, "loss": 0.0786, "step": 33 }, { "epoch": 0.1093687173301166, "grad_norm": 1.3953402042388916, "learning_rate": 5.483870967741935e-06, "loss": 0.0675, "step": 34 }, { "epoch": 0.11258544431041416, "grad_norm": 5.182158470153809, "learning_rate": 5.645161290322582e-06, "loss": 0.0977, "step": 35 }, { "epoch": 0.1158021712907117, "grad_norm": 2.713275909423828, "learning_rate": 5.806451612903226e-06, "loss": 0.0648, "step": 36 }, { "epoch": 0.11901889827100925, "grad_norm": 2.827744960784912, "learning_rate": 5.967741935483872e-06, "loss": 0.082, "step": 37 }, { "epoch": 0.1222356252513068, "grad_norm": 4.1014723777771, "learning_rate": 6.129032258064517e-06, "loss": 0.0758, "step": 38 }, { "epoch": 0.12545235223160434, "grad_norm": 1.5448578596115112, "learning_rate": 6.290322580645162e-06, "loss": 0.066, "step": 39 }, { "epoch": 0.12866907921190188, "grad_norm": 3.3927855491638184, "learning_rate": 6.451612903225806e-06, "loss": 0.0709, "step": 40 }, { "epoch": 0.13188580619219945, "grad_norm": 2.370821475982666, "learning_rate": 6.612903225806452e-06, "loss": 0.0852, "step": 41 }, { "epoch": 0.13510253317249699, "grad_norm": 3.0705552101135254, "learning_rate": 6.774193548387097e-06, "loss": 0.0702, "step": 42 }, { "epoch": 0.13831926015279453, "grad_norm": 4.215071678161621, "learning_rate": 6.935483870967743e-06, "loss": 0.0747, "step": 43 }, { "epoch": 0.14153598713309207, "grad_norm": 2.137068748474121, "learning_rate": 7.096774193548388e-06, "loss": 0.0657, "step": 44 }, { "epoch": 0.14475271411338964, "grad_norm": 4.441213130950928, "learning_rate": 7.258064516129033e-06, "loss": 0.0717, "step": 45 }, { "epoch": 0.14796944109368718, "grad_norm": 2.896615743637085, "learning_rate": 7.4193548387096784e-06, "loss": 0.066, "step": 46 }, { "epoch": 0.15118616807398472, "grad_norm": 1.594978928565979, "learning_rate": 7.580645161290323e-06, "loss": 0.0554, "step": 47 }, { "epoch": 0.15440289505428226, "grad_norm": 3.3716137409210205, "learning_rate": 7.741935483870968e-06, "loss": 0.075, "step": 48 }, { "epoch": 0.15761962203457983, "grad_norm": 2.1819281578063965, "learning_rate": 7.903225806451613e-06, "loss": 0.0583, "step": 49 }, { "epoch": 0.16083634901487737, "grad_norm": 5.265512943267822, "learning_rate": 8.064516129032258e-06, "loss": 0.0832, "step": 50 }, { "epoch": 0.1640530759951749, "grad_norm": 6.982624053955078, "learning_rate": 8.225806451612904e-06, "loss": 0.0772, "step": 51 }, { "epoch": 0.16726980297547245, "grad_norm": 4.082429885864258, "learning_rate": 8.387096774193549e-06, "loss": 0.0734, "step": 52 }, { "epoch": 0.17048652995577002, "grad_norm": 1.164082646369934, "learning_rate": 8.548387096774194e-06, "loss": 0.0541, "step": 53 }, { "epoch": 0.17370325693606756, "grad_norm": 2.810659646987915, "learning_rate": 8.70967741935484e-06, "loss": 0.052, "step": 54 }, { "epoch": 0.1769199839163651, "grad_norm": 1.5780754089355469, "learning_rate": 8.870967741935484e-06, "loss": 0.0582, "step": 55 }, { "epoch": 0.18013671089666264, "grad_norm": 1.068620204925537, "learning_rate": 9.03225806451613e-06, "loss": 0.065, "step": 56 }, { "epoch": 0.18335343787696018, "grad_norm": 2.523627758026123, "learning_rate": 9.193548387096775e-06, "loss": 0.0624, "step": 57 }, { "epoch": 0.18657016485725775, "grad_norm": 1.244511365890503, "learning_rate": 9.35483870967742e-06, "loss": 0.0503, "step": 58 }, { "epoch": 0.1897868918375553, "grad_norm": 0.9401457905769348, "learning_rate": 9.516129032258065e-06, "loss": 0.051, "step": 59 }, { "epoch": 0.19300361881785283, "grad_norm": 0.9283802509307861, "learning_rate": 9.67741935483871e-06, "loss": 0.0537, "step": 60 }, { "epoch": 0.19622034579815037, "grad_norm": 1.3737329244613647, "learning_rate": 9.838709677419356e-06, "loss": 0.0528, "step": 61 }, { "epoch": 0.19943707277844794, "grad_norm": 0.8466247916221619, "learning_rate": 1e-05, "loss": 0.0505, "step": 62 }, { "epoch": 0.20265379975874548, "grad_norm": 3.419365644454956, "learning_rate": 9.999920755303033e-06, "loss": 0.0696, "step": 63 }, { "epoch": 0.20587052673904302, "grad_norm": 2.1508781909942627, "learning_rate": 9.999683023724021e-06, "loss": 0.0502, "step": 64 }, { "epoch": 0.20908725371934056, "grad_norm": 1.2374366521835327, "learning_rate": 9.99928681279855e-06, "loss": 0.0536, "step": 65 }, { "epoch": 0.21230398069963813, "grad_norm": 2.0750317573547363, "learning_rate": 9.998732135085665e-06, "loss": 0.0634, "step": 66 }, { "epoch": 0.21552070767993567, "grad_norm": 1.1349234580993652, "learning_rate": 9.998019008167476e-06, "loss": 0.0472, "step": 67 }, { "epoch": 0.2187374346602332, "grad_norm": 4.443458080291748, "learning_rate": 9.99714745464859e-06, "loss": 0.0714, "step": 68 }, { "epoch": 0.22195416164053075, "grad_norm": 5.246551036834717, "learning_rate": 9.99611750215541e-06, "loss": 0.0686, "step": 69 }, { "epoch": 0.22517088862082832, "grad_norm": 1.8846714496612549, "learning_rate": 9.994929183335237e-06, "loss": 0.0707, "step": 70 }, { "epoch": 0.22838761560112586, "grad_norm": 2.367842674255371, "learning_rate": 9.993582535855265e-06, "loss": 0.0596, "step": 71 }, { "epoch": 0.2316043425814234, "grad_norm": 3.4915974140167236, "learning_rate": 9.992077602401358e-06, "loss": 0.0644, "step": 72 }, { "epoch": 0.23482106956172094, "grad_norm": 1.2369784116744995, "learning_rate": 9.990414430676716e-06, "loss": 0.0566, "step": 73 }, { "epoch": 0.2380377965420185, "grad_norm": 1.0058597326278687, "learning_rate": 9.988593073400354e-06, "loss": 0.0518, "step": 74 }, { "epoch": 0.24125452352231605, "grad_norm": 1.170558214187622, "learning_rate": 9.986613588305435e-06, "loss": 0.0481, "step": 75 }, { "epoch": 0.2444712505026136, "grad_norm": 0.9504396915435791, "learning_rate": 9.984476038137437e-06, "loss": 0.0458, "step": 76 }, { "epoch": 0.24768797748291113, "grad_norm": 4.625006675720215, "learning_rate": 9.982180490652165e-06, "loss": 0.0663, "step": 77 }, { "epoch": 0.25090470446320867, "grad_norm": 2.3191418647766113, "learning_rate": 9.979727018613607e-06, "loss": 0.0439, "step": 78 }, { "epoch": 0.25412143144350624, "grad_norm": 1.6738172769546509, "learning_rate": 9.977115699791622e-06, "loss": 0.0668, "step": 79 }, { "epoch": 0.25733815842380375, "grad_norm": 2.8376853466033936, "learning_rate": 9.974346616959476e-06, "loss": 0.0612, "step": 80 }, { "epoch": 0.2605548854041013, "grad_norm": 4.625660419464111, "learning_rate": 9.971419857891223e-06, "loss": 0.0602, "step": 81 }, { "epoch": 0.2637716123843989, "grad_norm": 6.061123847961426, "learning_rate": 9.968335515358916e-06, "loss": 0.0623, "step": 82 }, { "epoch": 0.2669883393646964, "grad_norm": 1.5065197944641113, "learning_rate": 9.965093687129669e-06, "loss": 0.0611, "step": 83 }, { "epoch": 0.27020506634499397, "grad_norm": 0.9030428528785706, "learning_rate": 9.961694475962562e-06, "loss": 0.0623, "step": 84 }, { "epoch": 0.27342179332529154, "grad_norm": 5.082237243652344, "learning_rate": 9.95813798960538e-06, "loss": 0.0758, "step": 85 }, { "epoch": 0.27663852030558905, "grad_norm": 5.320428848266602, "learning_rate": 9.954424340791195e-06, "loss": 0.0675, "step": 86 }, { "epoch": 0.2798552472858866, "grad_norm": 6.164220809936523, "learning_rate": 9.950553647234798e-06, "loss": 0.0713, "step": 87 }, { "epoch": 0.28307197426618413, "grad_norm": 3.148167371749878, "learning_rate": 9.94652603162896e-06, "loss": 0.0577, "step": 88 }, { "epoch": 0.2862887012464817, "grad_norm": 1.503680944442749, "learning_rate": 9.942341621640558e-06, "loss": 0.062, "step": 89 }, { "epoch": 0.28950542822677927, "grad_norm": 2.4841108322143555, "learning_rate": 9.938000549906509e-06, "loss": 0.0594, "step": 90 }, { "epoch": 0.2927221552070768, "grad_norm": 4.777047157287598, "learning_rate": 9.93350295402958e-06, "loss": 0.0601, "step": 91 }, { "epoch": 0.29593888218737435, "grad_norm": 4.280494213104248, "learning_rate": 9.92884897657402e-06, "loss": 0.0654, "step": 92 }, { "epoch": 0.2991556091676719, "grad_norm": 3.8119473457336426, "learning_rate": 9.924038765061042e-06, "loss": 0.0645, "step": 93 }, { "epoch": 0.30237233614796943, "grad_norm": 1.2760810852050781, "learning_rate": 9.919072471964146e-06, "loss": 0.0502, "step": 94 }, { "epoch": 0.305589063128267, "grad_norm": 2.0416059494018555, "learning_rate": 9.913950254704291e-06, "loss": 0.0601, "step": 95 }, { "epoch": 0.3088057901085645, "grad_norm": 2.9436042308807373, "learning_rate": 9.908672275644898e-06, "loss": 0.0552, "step": 96 }, { "epoch": 0.3120225170888621, "grad_norm": 2.80222487449646, "learning_rate": 9.903238702086707e-06, "loss": 0.0534, "step": 97 }, { "epoch": 0.31523924406915965, "grad_norm": 2.396406888961792, "learning_rate": 9.897649706262474e-06, "loss": 0.0516, "step": 98 }, { "epoch": 0.31845597104945716, "grad_norm": 1.1176875829696655, "learning_rate": 9.89190546533151e-06, "loss": 0.0544, "step": 99 }, { "epoch": 0.32167269802975473, "grad_norm": 2.592433452606201, "learning_rate": 9.88600616137407e-06, "loss": 0.0493, "step": 100 }, { "epoch": 0.32488942501005225, "grad_norm": 2.838442802429199, "learning_rate": 9.879951981385577e-06, "loss": 0.0628, "step": 101 }, { "epoch": 0.3281061519903498, "grad_norm": 3.090041399002075, "learning_rate": 9.873743117270691e-06, "loss": 0.0426, "step": 102 }, { "epoch": 0.3313228789706474, "grad_norm": 3.3808889389038086, "learning_rate": 9.867379765837237e-06, "loss": 0.0579, "step": 103 }, { "epoch": 0.3345396059509449, "grad_norm": 0.756629228591919, "learning_rate": 9.860862128789954e-06, "loss": 0.0537, "step": 104 }, { "epoch": 0.33775633293124246, "grad_norm": 1.3207106590270996, "learning_rate": 9.854190412724114e-06, "loss": 0.0406, "step": 105 }, { "epoch": 0.34097305991154003, "grad_norm": 2.726410388946533, "learning_rate": 9.847364829118963e-06, "loss": 0.0603, "step": 106 }, { "epoch": 0.34418978689183755, "grad_norm": 1.6553726196289062, "learning_rate": 9.840385594331022e-06, "loss": 0.0635, "step": 107 }, { "epoch": 0.3474065138721351, "grad_norm": 0.6768801808357239, "learning_rate": 9.833252929587231e-06, "loss": 0.0538, "step": 108 }, { "epoch": 0.3506232408524326, "grad_norm": 2.1218836307525635, "learning_rate": 9.825967060977933e-06, "loss": 0.0503, "step": 109 }, { "epoch": 0.3538399678327302, "grad_norm": 2.7606120109558105, "learning_rate": 9.818528219449705e-06, "loss": 0.05, "step": 110 }, { "epoch": 0.35705669481302776, "grad_norm": 1.1629998683929443, "learning_rate": 9.810936640798046e-06, "loss": 0.0456, "step": 111 }, { "epoch": 0.3602734217933253, "grad_norm": 1.027559757232666, "learning_rate": 9.803192565659898e-06, "loss": 0.0522, "step": 112 }, { "epoch": 0.36349014877362285, "grad_norm": 0.8349726796150208, "learning_rate": 9.795296239506011e-06, "loss": 0.0424, "step": 113 }, { "epoch": 0.36670687575392036, "grad_norm": 1.19048273563385, "learning_rate": 9.78724791263318e-06, "loss": 0.0472, "step": 114 }, { "epoch": 0.3699236027342179, "grad_norm": 0.9269819259643555, "learning_rate": 9.779047840156288e-06, "loss": 0.0528, "step": 115 }, { "epoch": 0.3731403297145155, "grad_norm": 0.7386018633842468, "learning_rate": 9.770696282000245e-06, "loss": 0.0561, "step": 116 }, { "epoch": 0.376357056694813, "grad_norm": 0.8170301914215088, "learning_rate": 9.762193502891726e-06, "loss": 0.0457, "step": 117 }, { "epoch": 0.3795737836751106, "grad_norm": 1.0884501934051514, "learning_rate": 9.753539772350792e-06, "loss": 0.0467, "step": 118 }, { "epoch": 0.38279051065540814, "grad_norm": 1.1235942840576172, "learning_rate": 9.744735364682347e-06, "loss": 0.0406, "step": 119 }, { "epoch": 0.38600723763570566, "grad_norm": 0.8838767409324646, "learning_rate": 9.735780558967434e-06, "loss": 0.0487, "step": 120 }, { "epoch": 0.3892239646160032, "grad_norm": 0.9004449248313904, "learning_rate": 9.726675639054403e-06, "loss": 0.0543, "step": 121 }, { "epoch": 0.39244069159630074, "grad_norm": 1.97210693359375, "learning_rate": 9.717420893549902e-06, "loss": 0.0493, "step": 122 }, { "epoch": 0.3956574185765983, "grad_norm": 0.7708803415298462, "learning_rate": 9.70801661580973e-06, "loss": 0.0437, "step": 123 }, { "epoch": 0.3988741455568959, "grad_norm": 1.6689863204956055, "learning_rate": 9.698463103929542e-06, "loss": 0.0455, "step": 124 }, { "epoch": 0.4020908725371934, "grad_norm": 1.5968708992004395, "learning_rate": 9.688760660735403e-06, "loss": 0.0569, "step": 125 }, { "epoch": 0.40530759951749096, "grad_norm": 1.1041440963745117, "learning_rate": 9.67890959377418e-06, "loss": 0.0576, "step": 126 }, { "epoch": 0.4085243264977885, "grad_norm": 2.1897428035736084, "learning_rate": 9.668910215303797e-06, "loss": 0.0514, "step": 127 }, { "epoch": 0.41174105347808604, "grad_norm": 1.594397783279419, "learning_rate": 9.658762842283343e-06, "loss": 0.0444, "step": 128 }, { "epoch": 0.4149577804583836, "grad_norm": 1.404996395111084, "learning_rate": 9.648467796363019e-06, "loss": 0.0493, "step": 129 }, { "epoch": 0.4181745074386811, "grad_norm": 0.889514148235321, "learning_rate": 9.638025403873939e-06, "loss": 0.0576, "step": 130 }, { "epoch": 0.4213912344189787, "grad_norm": 3.748598337173462, "learning_rate": 9.627435995817799e-06, "loss": 0.0599, "step": 131 }, { "epoch": 0.42460796139927626, "grad_norm": 4.13529109954834, "learning_rate": 9.616699907856368e-06, "loss": 0.0625, "step": 132 }, { "epoch": 0.42782468837957377, "grad_norm": 2.6388087272644043, "learning_rate": 9.605817480300863e-06, "loss": 0.0536, "step": 133 }, { "epoch": 0.43104141535987134, "grad_norm": 1.5752240419387817, "learning_rate": 9.594789058101154e-06, "loss": 0.0636, "step": 134 }, { "epoch": 0.43425814234016885, "grad_norm": 1.7247588634490967, "learning_rate": 9.58361499083483e-06, "loss": 0.0602, "step": 135 }, { "epoch": 0.4374748693204664, "grad_norm": 1.2270214557647705, "learning_rate": 9.57229563269612e-06, "loss": 0.0543, "step": 136 }, { "epoch": 0.440691596300764, "grad_norm": 1.9172810316085815, "learning_rate": 9.560831342484668e-06, "loss": 0.0456, "step": 137 }, { "epoch": 0.4439083232810615, "grad_norm": 5.208808898925781, "learning_rate": 9.549222483594154e-06, "loss": 0.0868, "step": 138 }, { "epoch": 0.44712505026135907, "grad_norm": 4.008519172668457, "learning_rate": 9.53746942400078e-06, "loss": 0.0646, "step": 139 }, { "epoch": 0.45034177724165664, "grad_norm": 2.6669585704803467, "learning_rate": 9.525572536251608e-06, "loss": 0.068, "step": 140 }, { "epoch": 0.45355850422195415, "grad_norm": 0.6555745601654053, "learning_rate": 9.513532197452737e-06, "loss": 0.0539, "step": 141 }, { "epoch": 0.4567752312022517, "grad_norm": 3.599552869796753, "learning_rate": 9.501348789257373e-06, "loss": 0.0515, "step": 142 }, { "epoch": 0.45999195818254923, "grad_norm": 5.188879489898682, "learning_rate": 9.48902269785371e-06, "loss": 0.0754, "step": 143 }, { "epoch": 0.4632086851628468, "grad_norm": 3.3630921840667725, "learning_rate": 9.476554313952697e-06, "loss": 0.0651, "step": 144 }, { "epoch": 0.46642541214314437, "grad_norm": 2.4736194610595703, "learning_rate": 9.46394403277566e-06, "loss": 0.0578, "step": 145 }, { "epoch": 0.4696421391234419, "grad_norm": 0.7932046055793762, "learning_rate": 9.451192254041759e-06, "loss": 0.0466, "step": 146 }, { "epoch": 0.47285886610373945, "grad_norm": 0.980038583278656, "learning_rate": 9.438299381955333e-06, "loss": 0.0452, "step": 147 }, { "epoch": 0.476075593084037, "grad_norm": 1.6979252099990845, "learning_rate": 9.425265825193077e-06, "loss": 0.0445, "step": 148 }, { "epoch": 0.47929232006433453, "grad_norm": 2.8460330963134766, "learning_rate": 9.412091996891097e-06, "loss": 0.051, "step": 149 }, { "epoch": 0.4825090470446321, "grad_norm": 1.3673274517059326, "learning_rate": 9.398778314631801e-06, "loss": 0.0473, "step": 150 }, { "epoch": 0.4857257740249296, "grad_norm": 1.6840636730194092, "learning_rate": 9.385325200430679e-06, "loss": 0.0525, "step": 151 }, { "epoch": 0.4889425010052272, "grad_norm": 2.1259257793426514, "learning_rate": 9.371733080722911e-06, "loss": 0.0548, "step": 152 }, { "epoch": 0.49215922798552475, "grad_norm": 0.7495786547660828, "learning_rate": 9.358002386349862e-06, "loss": 0.0379, "step": 153 }, { "epoch": 0.49537595496582226, "grad_norm": 1.248661756515503, "learning_rate": 9.34413355254542e-06, "loss": 0.0477, "step": 154 }, { "epoch": 0.49859268194611983, "grad_norm": 1.250671148300171, "learning_rate": 9.330127018922195e-06, "loss": 0.0524, "step": 155 }, { "epoch": 0.5018094089264173, "grad_norm": 0.5698635578155518, "learning_rate": 9.31598322945759e-06, "loss": 0.0508, "step": 156 }, { "epoch": 0.505026135906715, "grad_norm": 0.6476792097091675, "learning_rate": 9.301702632479734e-06, "loss": 0.0482, "step": 157 }, { "epoch": 0.5082428628870125, "grad_norm": 0.6281876564025879, "learning_rate": 9.287285680653254e-06, "loss": 0.0448, "step": 158 }, { "epoch": 0.51145958986731, "grad_norm": 0.8750901222229004, "learning_rate": 9.272732830964948e-06, "loss": 0.0652, "step": 159 }, { "epoch": 0.5146763168476075, "grad_norm": 0.6696067452430725, "learning_rate": 9.258044544709276e-06, "loss": 0.0432, "step": 160 }, { "epoch": 0.5178930438279051, "grad_norm": 0.5400009155273438, "learning_rate": 9.243221287473755e-06, "loss": 0.0389, "step": 161 }, { "epoch": 0.5211097708082026, "grad_norm": 0.59368497133255, "learning_rate": 9.228263529124199e-06, "loss": 0.0407, "step": 162 }, { "epoch": 0.5243264977885002, "grad_norm": 0.8467090725898743, "learning_rate": 9.21317174378982e-06, "loss": 0.0489, "step": 163 }, { "epoch": 0.5275432247687978, "grad_norm": 0.6505163908004761, "learning_rate": 9.197946409848196e-06, "loss": 0.0448, "step": 164 }, { "epoch": 0.5307599517490953, "grad_norm": 1.452300786972046, "learning_rate": 9.182588009910119e-06, "loss": 0.0487, "step": 165 }, { "epoch": 0.5339766787293928, "grad_norm": 1.1281909942626953, "learning_rate": 9.167097030804289e-06, "loss": 0.0468, "step": 166 }, { "epoch": 0.5371934057096904, "grad_norm": 1.3461655378341675, "learning_rate": 9.151473963561884e-06, "loss": 0.0589, "step": 167 }, { "epoch": 0.5404101326899879, "grad_norm": 0.5634208917617798, "learning_rate": 9.135719303400995e-06, "loss": 0.0405, "step": 168 }, { "epoch": 0.5436268596702855, "grad_norm": 1.198769450187683, "learning_rate": 9.119833549710927e-06, "loss": 0.0635, "step": 169 }, { "epoch": 0.5468435866505831, "grad_norm": 1.4971657991409302, "learning_rate": 9.103817206036383e-06, "loss": 0.05, "step": 170 }, { "epoch": 0.5500603136308806, "grad_norm": 0.9563581347465515, "learning_rate": 9.087670780061477e-06, "loss": 0.0555, "step": 171 }, { "epoch": 0.5532770406111781, "grad_norm": 0.7893424034118652, "learning_rate": 9.071394783593664e-06, "loss": 0.0495, "step": 172 }, { "epoch": 0.5564937675914756, "grad_norm": 1.9164668321609497, "learning_rate": 9.054989732547507e-06, "loss": 0.0519, "step": 173 }, { "epoch": 0.5597104945717732, "grad_norm": 0.7731283903121948, "learning_rate": 9.038456146928325e-06, "loss": 0.047, "step": 174 }, { "epoch": 0.5629272215520708, "grad_norm": 1.0580449104309082, "learning_rate": 9.021794550815713e-06, "loss": 0.052, "step": 175 }, { "epoch": 0.5661439485323683, "grad_norm": 0.7994678616523743, "learning_rate": 9.005005472346923e-06, "loss": 0.0587, "step": 176 }, { "epoch": 0.5693606755126659, "grad_norm": 1.5067932605743408, "learning_rate": 8.988089443700131e-06, "loss": 0.0426, "step": 177 }, { "epoch": 0.5725774024929634, "grad_norm": 1.0666264295578003, "learning_rate": 8.971047001077561e-06, "loss": 0.0432, "step": 178 }, { "epoch": 0.5757941294732609, "grad_norm": 2.072291135787964, "learning_rate": 8.953878684688492e-06, "loss": 0.0467, "step": 179 }, { "epoch": 0.5790108564535585, "grad_norm": 0.4865788221359253, "learning_rate": 8.936585038732143e-06, "loss": 0.0375, "step": 180 }, { "epoch": 0.582227583433856, "grad_norm": 2.6773552894592285, "learning_rate": 8.919166611380397e-06, "loss": 0.065, "step": 181 }, { "epoch": 0.5854443104141536, "grad_norm": 3.049079656600952, "learning_rate": 8.90162395476046e-06, "loss": 0.0652, "step": 182 }, { "epoch": 0.5886610373944512, "grad_norm": 1.1698511838912964, "learning_rate": 8.883957624937333e-06, "loss": 0.0519, "step": 183 }, { "epoch": 0.5918777643747487, "grad_norm": 1.1253437995910645, "learning_rate": 8.866168181896198e-06, "loss": 0.0562, "step": 184 }, { "epoch": 0.5950944913550462, "grad_norm": 0.5882744789123535, "learning_rate": 8.848256189524661e-06, "loss": 0.0472, "step": 185 }, { "epoch": 0.5983112183353438, "grad_norm": 0.5303323864936829, "learning_rate": 8.83022221559489e-06, "loss": 0.0359, "step": 186 }, { "epoch": 0.6015279453156414, "grad_norm": 0.8436638712882996, "learning_rate": 8.812066831745602e-06, "loss": 0.0335, "step": 187 }, { "epoch": 0.6047446722959389, "grad_norm": 0.7851281762123108, "learning_rate": 8.793790613463956e-06, "loss": 0.0459, "step": 188 }, { "epoch": 0.6079613992762364, "grad_norm": 0.7391752600669861, "learning_rate": 8.775394140067299e-06, "loss": 0.0445, "step": 189 }, { "epoch": 0.611178126256534, "grad_norm": 0.9071338772773743, "learning_rate": 8.756877994684818e-06, "loss": 0.0468, "step": 190 }, { "epoch": 0.6143948532368315, "grad_norm": 1.225618600845337, "learning_rate": 8.738242764239046e-06, "loss": 0.0433, "step": 191 }, { "epoch": 0.617611580217129, "grad_norm": 1.5072846412658691, "learning_rate": 8.719489039427256e-06, "loss": 0.0467, "step": 192 }, { "epoch": 0.6208283071974267, "grad_norm": 0.7594591975212097, "learning_rate": 8.700617414702746e-06, "loss": 0.0407, "step": 193 }, { "epoch": 0.6240450341777242, "grad_norm": 1.7316999435424805, "learning_rate": 8.681628488255986e-06, "loss": 0.0524, "step": 194 }, { "epoch": 0.6272617611580217, "grad_norm": 0.6949440240859985, "learning_rate": 8.66252286199567e-06, "loss": 0.0354, "step": 195 }, { "epoch": 0.6304784881383193, "grad_norm": 0.6324911117553711, "learning_rate": 8.643301141529619e-06, "loss": 0.0525, "step": 196 }, { "epoch": 0.6336952151186168, "grad_norm": 0.6617627739906311, "learning_rate": 8.6239639361456e-06, "loss": 0.0471, "step": 197 }, { "epoch": 0.6369119420989143, "grad_norm": 1.0735185146331787, "learning_rate": 8.604511858792006e-06, "loss": 0.0347, "step": 198 }, { "epoch": 0.640128669079212, "grad_norm": 0.6503901481628418, "learning_rate": 8.584945526058426e-06, "loss": 0.0621, "step": 199 }, { "epoch": 0.6433453960595095, "grad_norm": 0.5145460367202759, "learning_rate": 8.565265558156101e-06, "loss": 0.0444, "step": 200 }, { "epoch": 0.646562123039807, "grad_norm": 0.5394482016563416, "learning_rate": 8.545472578898276e-06, "loss": 0.0505, "step": 201 }, { "epoch": 0.6497788500201045, "grad_norm": 0.9032294750213623, "learning_rate": 8.525567215680397e-06, "loss": 0.0522, "step": 202 }, { "epoch": 0.6529955770004021, "grad_norm": 0.4706067144870758, "learning_rate": 8.505550099460264e-06, "loss": 0.0388, "step": 203 }, { "epoch": 0.6562123039806996, "grad_norm": 0.602165699005127, "learning_rate": 8.485421864737997e-06, "loss": 0.0557, "step": 204 }, { "epoch": 0.6594290309609971, "grad_norm": 1.2543528079986572, "learning_rate": 8.465183149535939e-06, "loss": 0.0539, "step": 205 }, { "epoch": 0.6626457579412948, "grad_norm": 1.1637508869171143, "learning_rate": 8.444834595378434e-06, "loss": 0.0526, "step": 206 }, { "epoch": 0.6658624849215923, "grad_norm": 2.013043165206909, "learning_rate": 8.424376847271483e-06, "loss": 0.0523, "step": 207 }, { "epoch": 0.6690792119018898, "grad_norm": 1.166269302368164, "learning_rate": 8.403810553682307e-06, "loss": 0.0482, "step": 208 }, { "epoch": 0.6722959388821874, "grad_norm": 1.0098134279251099, "learning_rate": 8.383136366518788e-06, "loss": 0.0477, "step": 209 }, { "epoch": 0.6755126658624849, "grad_norm": 0.7044484615325928, "learning_rate": 8.362354941108803e-06, "loss": 0.05, "step": 210 }, { "epoch": 0.6787293928427824, "grad_norm": 0.5775832533836365, "learning_rate": 8.341466936179457e-06, "loss": 0.0532, "step": 211 }, { "epoch": 0.6819461198230801, "grad_norm": 1.780887246131897, "learning_rate": 8.320473013836197e-06, "loss": 0.0385, "step": 212 }, { "epoch": 0.6851628468033776, "grad_norm": 0.6344701051712036, "learning_rate": 8.299373839541829e-06, "loss": 0.0345, "step": 213 }, { "epoch": 0.6883795737836751, "grad_norm": 0.597938597202301, "learning_rate": 8.278170082095422e-06, "loss": 0.0549, "step": 214 }, { "epoch": 0.6915963007639726, "grad_norm": 0.5144065022468567, "learning_rate": 8.256862413611113e-06, "loss": 0.0368, "step": 215 }, { "epoch": 0.6948130277442702, "grad_norm": 0.5459667444229126, "learning_rate": 8.23545150949679e-06, "loss": 0.0396, "step": 216 }, { "epoch": 0.6980297547245677, "grad_norm": 1.465198278427124, "learning_rate": 8.213938048432697e-06, "loss": 0.0534, "step": 217 }, { "epoch": 0.7012464817048653, "grad_norm": 0.9042430520057678, "learning_rate": 8.192322712349917e-06, "loss": 0.0463, "step": 218 }, { "epoch": 0.7044632086851629, "grad_norm": 1.0576472282409668, "learning_rate": 8.170606186408756e-06, "loss": 0.0392, "step": 219 }, { "epoch": 0.7076799356654604, "grad_norm": 0.7928400039672852, "learning_rate": 8.148789158977012e-06, "loss": 0.056, "step": 220 }, { "epoch": 0.7108966626457579, "grad_norm": 1.0211706161499023, "learning_rate": 8.126872321608185e-06, "loss": 0.059, "step": 221 }, { "epoch": 0.7141133896260555, "grad_norm": 0.5951483845710754, "learning_rate": 8.104856369019525e-06, "loss": 0.0493, "step": 222 }, { "epoch": 0.717330116606353, "grad_norm": 1.3035422563552856, "learning_rate": 8.08274199907003e-06, "loss": 0.0411, "step": 223 }, { "epoch": 0.7205468435866506, "grad_norm": 0.7945671081542969, "learning_rate": 8.060529912738316e-06, "loss": 0.0419, "step": 224 }, { "epoch": 0.7237635705669482, "grad_norm": 0.6303293704986572, "learning_rate": 8.038220814100403e-06, "loss": 0.0504, "step": 225 }, { "epoch": 0.7269802975472457, "grad_norm": 1.6979130506515503, "learning_rate": 8.0158154103074e-06, "loss": 0.0512, "step": 226 }, { "epoch": 0.7301970245275432, "grad_norm": 0.7927567362785339, "learning_rate": 7.993314411563075e-06, "loss": 0.0559, "step": 227 }, { "epoch": 0.7334137515078407, "grad_norm": 0.7262939214706421, "learning_rate": 7.970718531101365e-06, "loss": 0.0439, "step": 228 }, { "epoch": 0.7366304784881383, "grad_norm": 0.7317245006561279, "learning_rate": 7.948028485163744e-06, "loss": 0.0399, "step": 229 }, { "epoch": 0.7398472054684359, "grad_norm": 0.9586815237998962, "learning_rate": 7.925244992976538e-06, "loss": 0.0573, "step": 230 }, { "epoch": 0.7430639324487334, "grad_norm": 1.5452810525894165, "learning_rate": 7.902368776728125e-06, "loss": 0.0464, "step": 231 }, { "epoch": 0.746280659429031, "grad_norm": 0.8795072436332703, "learning_rate": 7.879400561546033e-06, "loss": 0.0453, "step": 232 }, { "epoch": 0.7494973864093285, "grad_norm": 0.648480236530304, "learning_rate": 7.856341075473963e-06, "loss": 0.0467, "step": 233 }, { "epoch": 0.752714113389626, "grad_norm": 0.646035373210907, "learning_rate": 7.833191049448706e-06, "loss": 0.0377, "step": 234 }, { "epoch": 0.7559308403699236, "grad_norm": 1.1588225364685059, "learning_rate": 7.809951217276986e-06, "loss": 0.0427, "step": 235 }, { "epoch": 0.7591475673502212, "grad_norm": 0.6012084484100342, "learning_rate": 7.786622315612182e-06, "loss": 0.0433, "step": 236 }, { "epoch": 0.7623642943305187, "grad_norm": 0.5877156853675842, "learning_rate": 7.763205083930995e-06, "loss": 0.0394, "step": 237 }, { "epoch": 0.7655810213108163, "grad_norm": 0.7161210179328918, "learning_rate": 7.739700264509993e-06, "loss": 0.0379, "step": 238 }, { "epoch": 0.7687977482911138, "grad_norm": 0.819977343082428, "learning_rate": 7.716108602402094e-06, "loss": 0.0599, "step": 239 }, { "epoch": 0.7720144752714113, "grad_norm": 0.6733528971672058, "learning_rate": 7.692430845412946e-06, "loss": 0.0465, "step": 240 }, { "epoch": 0.7752312022517089, "grad_norm": 0.6228499412536621, "learning_rate": 7.668667744077215e-06, "loss": 0.0478, "step": 241 }, { "epoch": 0.7784479292320065, "grad_norm": 0.8519704341888428, "learning_rate": 7.644820051634813e-06, "loss": 0.0503, "step": 242 }, { "epoch": 0.781664656212304, "grad_norm": 0.966986358165741, "learning_rate": 7.6208885240069995e-06, "loss": 0.0412, "step": 243 }, { "epoch": 0.7848813831926015, "grad_norm": 0.8694539070129395, "learning_rate": 7.596873919772438e-06, "loss": 0.0436, "step": 244 }, { "epoch": 0.7880981101728991, "grad_norm": 0.8280279040336609, "learning_rate": 7.572777000143145e-06, "loss": 0.0347, "step": 245 }, { "epoch": 0.7913148371531966, "grad_norm": 1.55968177318573, "learning_rate": 7.548598528940354e-06, "loss": 0.0478, "step": 246 }, { "epoch": 0.7945315641334941, "grad_norm": 0.5930286049842834, "learning_rate": 7.524339272570317e-06, "loss": 0.0511, "step": 247 }, { "epoch": 0.7977482911137918, "grad_norm": 1.2295900583267212, "learning_rate": 7.500000000000001e-06, "loss": 0.055, "step": 248 }, { "epoch": 0.8009650180940893, "grad_norm": 1.3309237957000732, "learning_rate": 7.475581482732717e-06, "loss": 0.0557, "step": 249 }, { "epoch": 0.8041817450743868, "grad_norm": 0.7122506499290466, "learning_rate": 7.451084494783668e-06, "loss": 0.0469, "step": 250 }, { "epoch": 0.8073984720546844, "grad_norm": 1.1560570001602173, "learning_rate": 7.4265098126554065e-06, "loss": 0.0489, "step": 251 }, { "epoch": 0.8106151990349819, "grad_norm": 1.4469757080078125, "learning_rate": 7.401858215313228e-06, "loss": 0.0501, "step": 252 }, { "epoch": 0.8138319260152794, "grad_norm": 0.8810122609138489, "learning_rate": 7.3771304841604764e-06, "loss": 0.0317, "step": 253 }, { "epoch": 0.817048652995577, "grad_norm": 0.5365167260169983, "learning_rate": 7.352327403013779e-06, "loss": 0.0414, "step": 254 }, { "epoch": 0.8202653799758746, "grad_norm": 0.6032179594039917, "learning_rate": 7.327449758078194e-06, "loss": 0.0413, "step": 255 }, { "epoch": 0.8234821069561721, "grad_norm": 0.7471963763237, "learning_rate": 7.302498337922293e-06, "loss": 0.0406, "step": 256 }, { "epoch": 0.8266988339364696, "grad_norm": 0.5315238237380981, "learning_rate": 7.27747393345317e-06, "loss": 0.0386, "step": 257 }, { "epoch": 0.8299155609167672, "grad_norm": 0.7365145683288574, "learning_rate": 7.2523773378913655e-06, "loss": 0.0554, "step": 258 }, { "epoch": 0.8331322878970647, "grad_norm": 1.0224618911743164, "learning_rate": 7.2272093467457226e-06, "loss": 0.0434, "step": 259 }, { "epoch": 0.8363490148773622, "grad_norm": 1.6006726026535034, "learning_rate": 7.201970757788172e-06, "loss": 0.0486, "step": 260 }, { "epoch": 0.8395657418576599, "grad_norm": 1.2492550611495972, "learning_rate": 7.17666237102845e-06, "loss": 0.0448, "step": 261 }, { "epoch": 0.8427824688379574, "grad_norm": 2.2926084995269775, "learning_rate": 7.151284988688731e-06, "loss": 0.044, "step": 262 }, { "epoch": 0.8459991958182549, "grad_norm": 0.7440332174301147, "learning_rate": 7.125839415178204e-06, "loss": 0.0477, "step": 263 }, { "epoch": 0.8492159227985525, "grad_norm": 0.717848002910614, "learning_rate": 7.100326457067576e-06, "loss": 0.0396, "step": 264 }, { "epoch": 0.85243264977885, "grad_norm": 0.6726682782173157, "learning_rate": 7.074746923063497e-06, "loss": 0.051, "step": 265 }, { "epoch": 0.8556493767591475, "grad_norm": 0.8255169987678528, "learning_rate": 7.049101623982938e-06, "loss": 0.0445, "step": 266 }, { "epoch": 0.8588661037394452, "grad_norm": 0.7894156575202942, "learning_rate": 7.02339137272748e-06, "loss": 0.0378, "step": 267 }, { "epoch": 0.8620828307197427, "grad_norm": 0.8157196640968323, "learning_rate": 6.9976169842575526e-06, "loss": 0.0395, "step": 268 }, { "epoch": 0.8652995577000402, "grad_norm": 0.613278329372406, "learning_rate": 6.971779275566593e-06, "loss": 0.032, "step": 269 }, { "epoch": 0.8685162846803377, "grad_norm": 0.6524770855903625, "learning_rate": 6.945879065655164e-06, "loss": 0.0463, "step": 270 }, { "epoch": 0.8717330116606353, "grad_norm": 0.8012259602546692, "learning_rate": 6.919917175504978e-06, "loss": 0.0556, "step": 271 }, { "epoch": 0.8749497386409328, "grad_norm": 0.8766384720802307, "learning_rate": 6.893894428052881e-06, "loss": 0.0534, "step": 272 }, { "epoch": 0.8781664656212304, "grad_norm": 0.6129264831542969, "learning_rate": 6.867811648164769e-06, "loss": 0.0486, "step": 273 }, { "epoch": 0.881383192601528, "grad_norm": 0.7514522075653076, "learning_rate": 6.841669662609437e-06, "loss": 0.0464, "step": 274 }, { "epoch": 0.8845999195818255, "grad_norm": 0.5673539042472839, "learning_rate": 6.815469300032374e-06, "loss": 0.0338, "step": 275 }, { "epoch": 0.887816646562123, "grad_norm": 0.5022168159484863, "learning_rate": 6.789211390929497e-06, "loss": 0.0283, "step": 276 }, { "epoch": 0.8910333735424206, "grad_norm": 1.195987343788147, "learning_rate": 6.762896767620827e-06, "loss": 0.0488, "step": 277 }, { "epoch": 0.8942501005227181, "grad_norm": 1.0466563701629639, "learning_rate": 6.736526264224101e-06, "loss": 0.0412, "step": 278 }, { "epoch": 0.8974668275030157, "grad_norm": 0.6366952061653137, "learning_rate": 6.710100716628345e-06, "loss": 0.0403, "step": 279 }, { "epoch": 0.9006835544833133, "grad_norm": 1.0278160572052002, "learning_rate": 6.6836209624673575e-06, "loss": 0.0527, "step": 280 }, { "epoch": 0.9039002814636108, "grad_norm": 0.8810920715332031, "learning_rate": 6.657087841093179e-06, "loss": 0.0517, "step": 281 }, { "epoch": 0.9071170084439083, "grad_norm": 1.306876301765442, "learning_rate": 6.6305021935494755e-06, "loss": 0.041, "step": 282 }, { "epoch": 0.9103337354242059, "grad_norm": 1.134096622467041, "learning_rate": 6.603864862544879e-06, "loss": 0.0474, "step": 283 }, { "epoch": 0.9135504624045034, "grad_norm": 1.364350438117981, "learning_rate": 6.5771766924262795e-06, "loss": 0.0418, "step": 284 }, { "epoch": 0.916767189384801, "grad_norm": 1.0049413442611694, "learning_rate": 6.5504385291520554e-06, "loss": 0.0386, "step": 285 }, { "epoch": 0.9199839163650985, "grad_norm": 0.5477365255355835, "learning_rate": 6.523651220265269e-06, "loss": 0.0394, "step": 286 }, { "epoch": 0.9232006433453961, "grad_norm": 1.8029471635818481, "learning_rate": 6.496815614866792e-06, "loss": 0.0486, "step": 287 }, { "epoch": 0.9264173703256936, "grad_norm": 1.1982808113098145, "learning_rate": 6.469932563588386e-06, "loss": 0.0385, "step": 288 }, { "epoch": 0.9296340973059911, "grad_norm": 1.8659429550170898, "learning_rate": 6.443002918565754e-06, "loss": 0.05, "step": 289 }, { "epoch": 0.9328508242862887, "grad_norm": 0.9505304098129272, "learning_rate": 6.41602753341152e-06, "loss": 0.0439, "step": 290 }, { "epoch": 0.9360675512665863, "grad_norm": 1.1647857427597046, "learning_rate": 6.389007263188176e-06, "loss": 0.0472, "step": 291 }, { "epoch": 0.9392842782468838, "grad_norm": 1.5960369110107422, "learning_rate": 6.361942964380967e-06, "loss": 0.0454, "step": 292 }, { "epoch": 0.9425010052271814, "grad_norm": 0.6902590990066528, "learning_rate": 6.334835494870759e-06, "loss": 0.0351, "step": 293 }, { "epoch": 0.9457177322074789, "grad_norm": 0.8172485828399658, "learning_rate": 6.307685713906835e-06, "loss": 0.0344, "step": 294 }, { "epoch": 0.9489344591877764, "grad_norm": 0.46139606833457947, "learning_rate": 6.2804944820796596e-06, "loss": 0.0308, "step": 295 }, { "epoch": 0.952151186168074, "grad_norm": 0.7500118613243103, "learning_rate": 6.2532626612936035e-06, "loss": 0.0541, "step": 296 }, { "epoch": 0.9553679131483716, "grad_norm": 0.693679928779602, "learning_rate": 6.225991114739622e-06, "loss": 0.0351, "step": 297 }, { "epoch": 0.9585846401286691, "grad_norm": 1.401061773300171, "learning_rate": 6.1986807068678926e-06, "loss": 0.0458, "step": 298 }, { "epoch": 0.9618013671089666, "grad_norm": 0.5701402425765991, "learning_rate": 6.171332303360411e-06, "loss": 0.0349, "step": 299 }, { "epoch": 0.9650180940892642, "grad_norm": 1.6502150297164917, "learning_rate": 6.143946771103561e-06, "loss": 0.0553, "step": 300 }, { "epoch": 0.9682348210695617, "grad_norm": 0.8593463897705078, "learning_rate": 6.11652497816062e-06, "loss": 0.0372, "step": 301 }, { "epoch": 0.9714515480498592, "grad_norm": 0.8635258674621582, "learning_rate": 6.089067793744258e-06, "loss": 0.0386, "step": 302 }, { "epoch": 0.9746682750301568, "grad_norm": 1.058119535446167, "learning_rate": 6.061576088188981e-06, "loss": 0.0407, "step": 303 }, { "epoch": 0.9778850020104544, "grad_norm": 1.316687822341919, "learning_rate": 6.034050732923538e-06, "loss": 0.0405, "step": 304 }, { "epoch": 0.9811017289907519, "grad_norm": 0.6700626611709595, "learning_rate": 6.006492600443301e-06, "loss": 0.0456, "step": 305 }, { "epoch": 0.9843184559710495, "grad_norm": 1.0319327116012573, "learning_rate": 5.978902564282616e-06, "loss": 0.0476, "step": 306 }, { "epoch": 0.987535182951347, "grad_norm": 0.5660614371299744, "learning_rate": 5.951281498987106e-06, "loss": 0.0431, "step": 307 }, { "epoch": 0.9907519099316445, "grad_norm": 0.5249197483062744, "learning_rate": 5.923630280085948e-06, "loss": 0.0416, "step": 308 }, { "epoch": 0.9939686369119421, "grad_norm": 0.5636727213859558, "learning_rate": 5.895949784064126e-06, "loss": 0.0316, "step": 309 }, { "epoch": 0.9971853638922397, "grad_norm": 0.5312305688858032, "learning_rate": 5.8682408883346535e-06, "loss": 0.0393, "step": 310 }, { "epoch": 1.0, "grad_norm": 0.8194127678871155, "learning_rate": 5.840504471210742e-06, "loss": 0.0507, "step": 311 }, { "epoch": 1.0032167269802976, "grad_norm": 0.4939586818218231, "learning_rate": 5.8127414118779825e-06, "loss": 0.0331, "step": 312 }, { "epoch": 1.006433453960595, "grad_norm": 0.9499027729034424, "learning_rate": 5.7849525903664636e-06, "loss": 0.0234, "step": 313 }, { "epoch": 1.0096501809408926, "grad_norm": 1.0846751928329468, "learning_rate": 5.757138887522884e-06, "loss": 0.0244, "step": 314 }, { "epoch": 1.0128669079211903, "grad_norm": 0.39767810702323914, "learning_rate": 5.729301184982622e-06, "loss": 0.0334, "step": 315 }, { "epoch": 1.0160836349014877, "grad_norm": 0.38344645500183105, "learning_rate": 5.701440365141799e-06, "loss": 0.0301, "step": 316 }, { "epoch": 1.0193003618817853, "grad_norm": 0.7626691460609436, "learning_rate": 5.673557311129306e-06, "loss": 0.0322, "step": 317 }, { "epoch": 1.022517088862083, "grad_norm": 0.5533730387687683, "learning_rate": 5.645652906778808e-06, "loss": 0.0304, "step": 318 }, { "epoch": 1.0257338158423803, "grad_norm": 0.7238726615905762, "learning_rate": 5.617728036600734e-06, "loss": 0.0219, "step": 319 }, { "epoch": 1.028950542822678, "grad_norm": 0.6678081750869751, "learning_rate": 5.5897835857542315e-06, "loss": 0.0271, "step": 320 }, { "epoch": 1.0321672698029756, "grad_norm": 1.125412940979004, "learning_rate": 5.561820440019117e-06, "loss": 0.0281, "step": 321 }, { "epoch": 1.035383996783273, "grad_norm": 0.9026764631271362, "learning_rate": 5.533839485767795e-06, "loss": 0.0181, "step": 322 }, { "epoch": 1.0386007237635706, "grad_norm": 0.6368532776832581, "learning_rate": 5.505841609937162e-06, "loss": 0.0261, "step": 323 }, { "epoch": 1.041817450743868, "grad_norm": 0.48171374201774597, "learning_rate": 5.477827700000492e-06, "loss": 0.0168, "step": 324 }, { "epoch": 1.0450341777241656, "grad_norm": 0.7617553472518921, "learning_rate": 5.449798643939305e-06, "loss": 0.0294, "step": 325 }, { "epoch": 1.0482509047044632, "grad_norm": 0.7659939527511597, "learning_rate": 5.421755330215223e-06, "loss": 0.0233, "step": 326 }, { "epoch": 1.0514676316847607, "grad_norm": 1.1631779670715332, "learning_rate": 5.39369864774181e-06, "loss": 0.031, "step": 327 }, { "epoch": 1.0546843586650583, "grad_norm": 0.5692664384841919, "learning_rate": 5.365629485856381e-06, "loss": 0.017, "step": 328 }, { "epoch": 1.057901085645356, "grad_norm": 0.8848438858985901, "learning_rate": 5.337548734291827e-06, "loss": 0.0372, "step": 329 }, { "epoch": 1.0611178126256533, "grad_norm": 0.6400728225708008, "learning_rate": 5.30945728314841e-06, "loss": 0.0213, "step": 330 }, { "epoch": 1.064334539605951, "grad_norm": 0.9902735352516174, "learning_rate": 5.281356022865542e-06, "loss": 0.0262, "step": 331 }, { "epoch": 1.0675512665862485, "grad_norm": 0.5511563420295715, "learning_rate": 5.253245844193564e-06, "loss": 0.0212, "step": 332 }, { "epoch": 1.070767993566546, "grad_norm": 0.7277644276618958, "learning_rate": 5.225127638165514e-06, "loss": 0.0164, "step": 333 }, { "epoch": 1.0739847205468436, "grad_norm": 0.5540727376937866, "learning_rate": 5.197002296068878e-06, "loss": 0.024, "step": 334 }, { "epoch": 1.0772014475271412, "grad_norm": 0.55019611120224, "learning_rate": 5.168870709417342e-06, "loss": 0.0203, "step": 335 }, { "epoch": 1.0804181745074386, "grad_norm": 0.5772176384925842, "learning_rate": 5.140733769922525e-06, "loss": 0.021, "step": 336 }, { "epoch": 1.0836349014877362, "grad_norm": 0.7441113591194153, "learning_rate": 5.112592369465731e-06, "loss": 0.0372, "step": 337 }, { "epoch": 1.0868516284680338, "grad_norm": 0.5592713952064514, "learning_rate": 5.084447400069656e-06, "loss": 0.02, "step": 338 }, { "epoch": 1.0900683554483312, "grad_norm": 0.6708924770355225, "learning_rate": 5.0562997538701295e-06, "loss": 0.0207, "step": 339 }, { "epoch": 1.0932850824286289, "grad_norm": 0.6388556361198425, "learning_rate": 5.0281503230878304e-06, "loss": 0.0234, "step": 340 }, { "epoch": 1.0965018094089265, "grad_norm": 0.5663427114486694, "learning_rate": 5e-06, "loss": 0.02, "step": 341 }, { "epoch": 1.099718536389224, "grad_norm": 1.1264530420303345, "learning_rate": 4.971849676912172e-06, "loss": 0.0243, "step": 342 }, { "epoch": 1.1029352633695215, "grad_norm": 1.2352030277252197, "learning_rate": 4.943700246129871e-06, "loss": 0.0297, "step": 343 }, { "epoch": 1.1061519903498191, "grad_norm": 1.2657285928726196, "learning_rate": 4.915552599930345e-06, "loss": 0.0204, "step": 344 }, { "epoch": 1.1093687173301165, "grad_norm": 0.7982221245765686, "learning_rate": 4.887407630534271e-06, "loss": 0.0306, "step": 345 }, { "epoch": 1.1125854443104142, "grad_norm": 0.7376164197921753, "learning_rate": 4.859266230077474e-06, "loss": 0.0299, "step": 346 }, { "epoch": 1.1158021712907118, "grad_norm": 0.7835693359375, "learning_rate": 4.83112929058266e-06, "loss": 0.0231, "step": 347 }, { "epoch": 1.1190188982710092, "grad_norm": 1.1944645643234253, "learning_rate": 4.802997703931124e-06, "loss": 0.0277, "step": 348 }, { "epoch": 1.1222356252513068, "grad_norm": 0.6810235977172852, "learning_rate": 4.7748723618344865e-06, "loss": 0.0261, "step": 349 }, { "epoch": 1.1254523522316044, "grad_norm": 0.6911215782165527, "learning_rate": 4.746754155806437e-06, "loss": 0.0213, "step": 350 }, { "epoch": 1.1286690792119018, "grad_norm": 0.8558758497238159, "learning_rate": 4.71864397713446e-06, "loss": 0.0238, "step": 351 }, { "epoch": 1.1318858061921995, "grad_norm": 0.9453890919685364, "learning_rate": 4.6905427168515914e-06, "loss": 0.0303, "step": 352 }, { "epoch": 1.135102533172497, "grad_norm": 0.666305661201477, "learning_rate": 4.662451265708174e-06, "loss": 0.0188, "step": 353 }, { "epoch": 1.1383192601527945, "grad_norm": 0.7106337547302246, "learning_rate": 4.63437051414362e-06, "loss": 0.0305, "step": 354 }, { "epoch": 1.1415359871330921, "grad_norm": 0.5249149203300476, "learning_rate": 4.606301352258192e-06, "loss": 0.0186, "step": 355 }, { "epoch": 1.1447527141133897, "grad_norm": 0.6891714334487915, "learning_rate": 4.5782446697847775e-06, "loss": 0.0193, "step": 356 }, { "epoch": 1.1479694410936871, "grad_norm": 1.2555615901947021, "learning_rate": 4.550201356060695e-06, "loss": 0.0274, "step": 357 }, { "epoch": 1.1511861680739848, "grad_norm": 1.113802433013916, "learning_rate": 4.52217229999951e-06, "loss": 0.0256, "step": 358 }, { "epoch": 1.1544028950542822, "grad_norm": 0.6190158724784851, "learning_rate": 4.49415839006284e-06, "loss": 0.0265, "step": 359 }, { "epoch": 1.1576196220345798, "grad_norm": 0.8673491477966309, "learning_rate": 4.466160514232206e-06, "loss": 0.0341, "step": 360 }, { "epoch": 1.1608363490148774, "grad_norm": 0.4235200583934784, "learning_rate": 4.438179559980885e-06, "loss": 0.0163, "step": 361 }, { "epoch": 1.1640530759951748, "grad_norm": 0.9669196605682373, "learning_rate": 4.410216414245771e-06, "loss": 0.0357, "step": 362 }, { "epoch": 1.1672698029754724, "grad_norm": 0.8336294293403625, "learning_rate": 4.382271963399268e-06, "loss": 0.0161, "step": 363 }, { "epoch": 1.17048652995577, "grad_norm": 0.8520212769508362, "learning_rate": 4.354347093221194e-06, "loss": 0.0206, "step": 364 }, { "epoch": 1.1737032569360675, "grad_norm": 0.9719181060791016, "learning_rate": 4.326442688870697e-06, "loss": 0.0207, "step": 365 }, { "epoch": 1.176919983916365, "grad_norm": 0.6609673500061035, "learning_rate": 4.298559634858202e-06, "loss": 0.0288, "step": 366 }, { "epoch": 1.1801367108966627, "grad_norm": 0.9857617616653442, "learning_rate": 4.270698815017379e-06, "loss": 0.0279, "step": 367 }, { "epoch": 1.1833534378769601, "grad_norm": 0.788272500038147, "learning_rate": 4.2428611124771184e-06, "loss": 0.0144, "step": 368 }, { "epoch": 1.1865701648572577, "grad_norm": 0.8060084581375122, "learning_rate": 4.2150474096335356e-06, "loss": 0.023, "step": 369 }, { "epoch": 1.1897868918375554, "grad_norm": 1.2030549049377441, "learning_rate": 4.187258588122019e-06, "loss": 0.0355, "step": 370 }, { "epoch": 1.1930036188178528, "grad_norm": 0.6166083812713623, "learning_rate": 4.15949552878926e-06, "loss": 0.0165, "step": 371 }, { "epoch": 1.1962203457981504, "grad_norm": 1.5844203233718872, "learning_rate": 4.131759111665349e-06, "loss": 0.0323, "step": 372 }, { "epoch": 1.199437072778448, "grad_norm": 0.9516481757164001, "learning_rate": 4.104050215935875e-06, "loss": 0.0306, "step": 373 }, { "epoch": 1.2026537997587454, "grad_norm": 1.0186715126037598, "learning_rate": 4.076369719914055e-06, "loss": 0.0252, "step": 374 }, { "epoch": 1.205870526739043, "grad_norm": 0.891091525554657, "learning_rate": 4.048718501012895e-06, "loss": 0.0168, "step": 375 }, { "epoch": 1.2090872537193404, "grad_norm": 0.6966618299484253, "learning_rate": 4.021097435717386e-06, "loss": 0.0297, "step": 376 }, { "epoch": 1.212303980699638, "grad_norm": 0.9956583380699158, "learning_rate": 3.993507399556699e-06, "loss": 0.0189, "step": 377 }, { "epoch": 1.2155207076799357, "grad_norm": 0.6981390714645386, "learning_rate": 3.965949267076465e-06, "loss": 0.0175, "step": 378 }, { "epoch": 1.218737434660233, "grad_norm": 0.9271097779273987, "learning_rate": 3.938423911811021e-06, "loss": 0.0215, "step": 379 }, { "epoch": 1.2219541616405307, "grad_norm": 1.0739299058914185, "learning_rate": 3.910932206255742e-06, "loss": 0.0304, "step": 380 }, { "epoch": 1.2251708886208283, "grad_norm": 0.7238112092018127, "learning_rate": 3.883475021839382e-06, "loss": 0.0269, "step": 381 }, { "epoch": 1.2283876156011257, "grad_norm": 0.8449429273605347, "learning_rate": 3.856053228896442e-06, "loss": 0.0253, "step": 382 }, { "epoch": 1.2316043425814234, "grad_norm": 1.0511232614517212, "learning_rate": 3.8286676966395895e-06, "loss": 0.0286, "step": 383 }, { "epoch": 1.234821069561721, "grad_norm": 0.6601850390434265, "learning_rate": 3.8013192931321095e-06, "loss": 0.0161, "step": 384 }, { "epoch": 1.2380377965420184, "grad_norm": 1.7054966688156128, "learning_rate": 3.77400888526038e-06, "loss": 0.0382, "step": 385 }, { "epoch": 1.241254523522316, "grad_norm": 0.5929433703422546, "learning_rate": 3.7467373387063973e-06, "loss": 0.027, "step": 386 }, { "epoch": 1.2444712505026136, "grad_norm": 0.6725859642028809, "learning_rate": 3.719505517920342e-06, "loss": 0.0283, "step": 387 }, { "epoch": 1.247687977482911, "grad_norm": 0.6051467657089233, "learning_rate": 3.692314286093167e-06, "loss": 0.024, "step": 388 }, { "epoch": 1.2509047044632087, "grad_norm": 0.7530882358551025, "learning_rate": 3.6651645051292415e-06, "loss": 0.0226, "step": 389 }, { "epoch": 1.2541214314435063, "grad_norm": 0.6657713651657104, "learning_rate": 3.6380570356190346e-06, "loss": 0.0197, "step": 390 }, { "epoch": 1.2573381584238037, "grad_norm": 1.1519615650177002, "learning_rate": 3.610992736811827e-06, "loss": 0.0188, "step": 391 }, { "epoch": 1.2605548854041013, "grad_norm": 0.8955692648887634, "learning_rate": 3.58397246658848e-06, "loss": 0.0193, "step": 392 }, { "epoch": 1.263771612384399, "grad_norm": 1.1424133777618408, "learning_rate": 3.556997081434248e-06, "loss": 0.025, "step": 393 }, { "epoch": 1.2669883393646963, "grad_norm": 0.634136974811554, "learning_rate": 3.5300674364116173e-06, "loss": 0.0237, "step": 394 }, { "epoch": 1.270205066344994, "grad_norm": 0.9286861419677734, "learning_rate": 3.5031843851332105e-06, "loss": 0.0231, "step": 395 }, { "epoch": 1.2734217933252916, "grad_norm": 0.7624120712280273, "learning_rate": 3.476348779734732e-06, "loss": 0.0246, "step": 396 }, { "epoch": 1.276638520305589, "grad_norm": 0.8140252828598022, "learning_rate": 3.449561470847947e-06, "loss": 0.0262, "step": 397 }, { "epoch": 1.2798552472858866, "grad_norm": 1.130080223083496, "learning_rate": 3.4228233075737225e-06, "loss": 0.0294, "step": 398 }, { "epoch": 1.2830719742661842, "grad_norm": 0.8101189136505127, "learning_rate": 3.3961351374551234e-06, "loss": 0.0279, "step": 399 }, { "epoch": 1.2862887012464816, "grad_norm": 0.5413293838500977, "learning_rate": 3.3694978064505258e-06, "loss": 0.0104, "step": 400 }, { "epoch": 1.2895054282267793, "grad_norm": 0.940980851650238, "learning_rate": 3.3429121589068213e-06, "loss": 0.023, "step": 401 }, { "epoch": 1.292722155207077, "grad_norm": 1.4094127416610718, "learning_rate": 3.316379037532644e-06, "loss": 0.0238, "step": 402 }, { "epoch": 1.2959388821873743, "grad_norm": 0.6321690082550049, "learning_rate": 3.289899283371657e-06, "loss": 0.0233, "step": 403 }, { "epoch": 1.299155609167672, "grad_norm": 0.7912178635597229, "learning_rate": 3.2634737357758994e-06, "loss": 0.0247, "step": 404 }, { "epoch": 1.3023723361479695, "grad_norm": 0.808509111404419, "learning_rate": 3.2371032323791757e-06, "loss": 0.0209, "step": 405 }, { "epoch": 1.305589063128267, "grad_norm": 0.5239757895469666, "learning_rate": 3.2107886090705035e-06, "loss": 0.0188, "step": 406 }, { "epoch": 1.3088057901085646, "grad_norm": 0.7358213067054749, "learning_rate": 3.1845306999676274e-06, "loss": 0.0328, "step": 407 }, { "epoch": 1.3120225170888622, "grad_norm": 0.8183391690254211, "learning_rate": 3.158330337390565e-06, "loss": 0.0263, "step": 408 }, { "epoch": 1.3152392440691596, "grad_norm": 0.7185299396514893, "learning_rate": 3.132188351835232e-06, "loss": 0.0316, "step": 409 }, { "epoch": 1.3184559710494572, "grad_norm": 0.7911690473556519, "learning_rate": 3.10610557194712e-06, "loss": 0.0229, "step": 410 }, { "epoch": 1.3216726980297548, "grad_norm": 0.5570195317268372, "learning_rate": 3.080082824495024e-06, "loss": 0.0276, "step": 411 }, { "epoch": 1.3248894250100522, "grad_norm": 0.760032057762146, "learning_rate": 3.0541209343448373e-06, "loss": 0.0228, "step": 412 }, { "epoch": 1.3281061519903499, "grad_norm": 0.6067129373550415, "learning_rate": 3.0282207244334084e-06, "loss": 0.0211, "step": 413 }, { "epoch": 1.3313228789706475, "grad_norm": 0.8982436656951904, "learning_rate": 3.0023830157424504e-06, "loss": 0.0269, "step": 414 }, { "epoch": 1.334539605950945, "grad_norm": 0.9359907507896423, "learning_rate": 2.97660862727252e-06, "loss": 0.0226, "step": 415 }, { "epoch": 1.3377563329312425, "grad_norm": 0.5233786106109619, "learning_rate": 2.950898376017064e-06, "loss": 0.0226, "step": 416 }, { "epoch": 1.3409730599115401, "grad_norm": 0.5477368235588074, "learning_rate": 2.9252530769365053e-06, "loss": 0.0177, "step": 417 }, { "epoch": 1.3441897868918375, "grad_norm": 0.7857730984687805, "learning_rate": 2.8996735429324256e-06, "loss": 0.0208, "step": 418 }, { "epoch": 1.3474065138721352, "grad_norm": 0.6837164163589478, "learning_rate": 2.874160584821798e-06, "loss": 0.0204, "step": 419 }, { "epoch": 1.3506232408524326, "grad_norm": 0.5833801627159119, "learning_rate": 2.848715011311271e-06, "loss": 0.0179, "step": 420 }, { "epoch": 1.3538399678327302, "grad_norm": 0.7275317311286926, "learning_rate": 2.823337628971551e-06, "loss": 0.0176, "step": 421 }, { "epoch": 1.3570566948130278, "grad_norm": 1.1249991655349731, "learning_rate": 2.7980292422118282e-06, "loss": 0.0264, "step": 422 }, { "epoch": 1.3602734217933252, "grad_norm": 0.8892255425453186, "learning_rate": 2.7727906532542783e-06, "loss": 0.021, "step": 423 }, { "epoch": 1.3634901487736228, "grad_norm": 0.8986218571662903, "learning_rate": 2.7476226621086354e-06, "loss": 0.0209, "step": 424 }, { "epoch": 1.3667068757539202, "grad_norm": 0.8153395056724548, "learning_rate": 2.72252606654683e-06, "loss": 0.0209, "step": 425 }, { "epoch": 1.3699236027342179, "grad_norm": 0.8225369453430176, "learning_rate": 2.697501662077707e-06, "loss": 0.0299, "step": 426 }, { "epoch": 1.3731403297145155, "grad_norm": 0.8588507175445557, "learning_rate": 2.6725502419218084e-06, "loss": 0.0346, "step": 427 }, { "epoch": 1.376357056694813, "grad_norm": 1.9007744789123535, "learning_rate": 2.6476725969862227e-06, "loss": 0.0418, "step": 428 }, { "epoch": 1.3795737836751105, "grad_norm": 1.121382474899292, "learning_rate": 2.622869515839524e-06, "loss": 0.0179, "step": 429 }, { "epoch": 1.3827905106554081, "grad_norm": 0.7519135475158691, "learning_rate": 2.5981417846867753e-06, "loss": 0.0242, "step": 430 }, { "epoch": 1.3860072376357055, "grad_norm": 0.7129932641983032, "learning_rate": 2.573490187344596e-06, "loss": 0.0137, "step": 431 }, { "epoch": 1.3892239646160032, "grad_norm": 0.7935872673988342, "learning_rate": 2.548915505216333e-06, "loss": 0.0287, "step": 432 }, { "epoch": 1.3924406915963008, "grad_norm": 1.06389582157135, "learning_rate": 2.524418517267283e-06, "loss": 0.0251, "step": 433 }, { "epoch": 1.3956574185765982, "grad_norm": 0.8240230679512024, "learning_rate": 2.5000000000000015e-06, "loss": 0.0297, "step": 434 }, { "epoch": 1.3988741455568958, "grad_norm": 0.9097860455513, "learning_rate": 2.4756607274296844e-06, "loss": 0.0236, "step": 435 }, { "epoch": 1.4020908725371934, "grad_norm": 0.634899377822876, "learning_rate": 2.4514014710596467e-06, "loss": 0.0205, "step": 436 }, { "epoch": 1.4053075995174908, "grad_norm": 1.0875910520553589, "learning_rate": 2.4272229998568576e-06, "loss": 0.0367, "step": 437 }, { "epoch": 1.4085243264977885, "grad_norm": 1.6455551385879517, "learning_rate": 2.4031260802275623e-06, "loss": 0.0271, "step": 438 }, { "epoch": 1.411741053478086, "grad_norm": 0.7204722166061401, "learning_rate": 2.3791114759930013e-06, "loss": 0.0188, "step": 439 }, { "epoch": 1.4149577804583835, "grad_norm": 1.0633268356323242, "learning_rate": 2.3551799483651894e-06, "loss": 0.0264, "step": 440 }, { "epoch": 1.4181745074386811, "grad_norm": 0.6467992663383484, "learning_rate": 2.331332255922784e-06, "loss": 0.0184, "step": 441 }, { "epoch": 1.4213912344189787, "grad_norm": 0.5554575324058533, "learning_rate": 2.307569154587056e-06, "loss": 0.0184, "step": 442 }, { "epoch": 1.4246079613992761, "grad_norm": 0.6711392998695374, "learning_rate": 2.283891397597908e-06, "loss": 0.0206, "step": 443 }, { "epoch": 1.4278246883795738, "grad_norm": 0.7576943635940552, "learning_rate": 2.2602997354900075e-06, "loss": 0.031, "step": 444 }, { "epoch": 1.4310414153598714, "grad_norm": 0.6453703045845032, "learning_rate": 2.236794916069007e-06, "loss": 0.0171, "step": 445 }, { "epoch": 1.4342581423401688, "grad_norm": 0.5706139802932739, "learning_rate": 2.2133776843878185e-06, "loss": 0.0191, "step": 446 }, { "epoch": 1.4374748693204664, "grad_norm": 0.7611203789710999, "learning_rate": 2.190048782723015e-06, "loss": 0.0225, "step": 447 }, { "epoch": 1.440691596300764, "grad_norm": 0.6822800040245056, "learning_rate": 2.166808950551296e-06, "loss": 0.0256, "step": 448 }, { "epoch": 1.4439083232810614, "grad_norm": 0.798964262008667, "learning_rate": 2.1436589245260375e-06, "loss": 0.0236, "step": 449 }, { "epoch": 1.447125050261359, "grad_norm": 0.8539571166038513, "learning_rate": 2.120599438453968e-06, "loss": 0.0241, "step": 450 }, { "epoch": 1.4503417772416567, "grad_norm": 0.8464952707290649, "learning_rate": 2.0976312232718763e-06, "loss": 0.0282, "step": 451 }, { "epoch": 1.453558504221954, "grad_norm": 0.7654972076416016, "learning_rate": 2.074755007023461e-06, "loss": 0.0278, "step": 452 }, { "epoch": 1.4567752312022517, "grad_norm": 0.9112337231636047, "learning_rate": 2.0519715148362585e-06, "loss": 0.0176, "step": 453 }, { "epoch": 1.4599919581825493, "grad_norm": 0.6856303215026855, "learning_rate": 2.0292814688986375e-06, "loss": 0.0224, "step": 454 }, { "epoch": 1.4632086851628467, "grad_norm": 0.9206861257553101, "learning_rate": 2.0066855884369246e-06, "loss": 0.0312, "step": 455 }, { "epoch": 1.4664254121431444, "grad_norm": 0.9237130880355835, "learning_rate": 1.9841845896926022e-06, "loss": 0.0256, "step": 456 }, { "epoch": 1.469642139123442, "grad_norm": 0.5882164835929871, "learning_rate": 1.961779185899597e-06, "loss": 0.0224, "step": 457 }, { "epoch": 1.4728588661037394, "grad_norm": 0.5949448347091675, "learning_rate": 1.9394700872616856e-06, "loss": 0.0197, "step": 458 }, { "epoch": 1.476075593084037, "grad_norm": 0.5350119471549988, "learning_rate": 1.9172580009299735e-06, "loss": 0.0164, "step": 459 }, { "epoch": 1.4792923200643346, "grad_norm": 1.140408992767334, "learning_rate": 1.8951436309804766e-06, "loss": 0.0344, "step": 460 }, { "epoch": 1.482509047044632, "grad_norm": 0.7622132301330566, "learning_rate": 1.8731276783918162e-06, "loss": 0.0195, "step": 461 }, { "epoch": 1.4857257740249297, "grad_norm": 1.5069661140441895, "learning_rate": 1.8512108410229878e-06, "loss": 0.0306, "step": 462 }, { "epoch": 1.4889425010052273, "grad_norm": 0.7332004904747009, "learning_rate": 1.8293938135912475e-06, "loss": 0.0217, "step": 463 }, { "epoch": 1.4921592279855247, "grad_norm": 0.7239487767219543, "learning_rate": 1.8076772876500831e-06, "loss": 0.0298, "step": 464 }, { "epoch": 1.4953759549658223, "grad_norm": 0.8309620022773743, "learning_rate": 1.7860619515673034e-06, "loss": 0.0316, "step": 465 }, { "epoch": 1.49859268194612, "grad_norm": 0.6418971419334412, "learning_rate": 1.7645484905032129e-06, "loss": 0.0182, "step": 466 }, { "epoch": 1.5018094089264173, "grad_norm": 0.5187109112739563, "learning_rate": 1.74313758638889e-06, "loss": 0.025, "step": 467 }, { "epoch": 1.505026135906715, "grad_norm": 0.9927024245262146, "learning_rate": 1.7218299179045789e-06, "loss": 0.0256, "step": 468 }, { "epoch": 1.5082428628870126, "grad_norm": 0.6816557049751282, "learning_rate": 1.7006261604581725e-06, "loss": 0.0144, "step": 469 }, { "epoch": 1.51145958986731, "grad_norm": 0.6767432689666748, "learning_rate": 1.6795269861638041e-06, "loss": 0.0256, "step": 470 }, { "epoch": 1.5146763168476074, "grad_norm": 0.8080036044120789, "learning_rate": 1.6585330638205454e-06, "loss": 0.0283, "step": 471 }, { "epoch": 1.5178930438279052, "grad_norm": 0.7607430219650269, "learning_rate": 1.6376450588911985e-06, "loss": 0.0223, "step": 472 }, { "epoch": 1.5211097708082026, "grad_norm": 0.4729120433330536, "learning_rate": 1.6168636334812126e-06, "loss": 0.0142, "step": 473 }, { "epoch": 1.5243264977885, "grad_norm": 0.5791823267936707, "learning_rate": 1.5961894463176942e-06, "loss": 0.0258, "step": 474 }, { "epoch": 1.527543224768798, "grad_norm": 1.1532747745513916, "learning_rate": 1.5756231527285181e-06, "loss": 0.027, "step": 475 }, { "epoch": 1.5307599517490953, "grad_norm": 0.7709406018257141, "learning_rate": 1.555165404621567e-06, "loss": 0.0273, "step": 476 }, { "epoch": 1.5339766787293927, "grad_norm": 1.0912410020828247, "learning_rate": 1.5348168504640631e-06, "loss": 0.0282, "step": 477 }, { "epoch": 1.5371934057096905, "grad_norm": 0.7054423689842224, "learning_rate": 1.5145781352620054e-06, "loss": 0.0273, "step": 478 }, { "epoch": 1.540410132689988, "grad_norm": 0.46668004989624023, "learning_rate": 1.4944499005397372e-06, "loss": 0.0164, "step": 479 }, { "epoch": 1.5436268596702853, "grad_norm": 0.7803506851196289, "learning_rate": 1.4744327843196043e-06, "loss": 0.0283, "step": 480 }, { "epoch": 1.5468435866505832, "grad_norm": 0.7836430668830872, "learning_rate": 1.4545274211017264e-06, "loss": 0.0278, "step": 481 }, { "epoch": 1.5500603136308806, "grad_norm": 1.152942419052124, "learning_rate": 1.434734441843899e-06, "loss": 0.0273, "step": 482 }, { "epoch": 1.553277040611178, "grad_norm": 0.7189183831214905, "learning_rate": 1.4150544739415755e-06, "loss": 0.0238, "step": 483 }, { "epoch": 1.5564937675914756, "grad_norm": 1.2334411144256592, "learning_rate": 1.3954881412079945e-06, "loss": 0.0296, "step": 484 }, { "epoch": 1.5597104945717732, "grad_norm": 1.2712557315826416, "learning_rate": 1.3760360638544012e-06, "loss": 0.0258, "step": 485 }, { "epoch": 1.5629272215520706, "grad_norm": 0.6514741778373718, "learning_rate": 1.3566988584703817e-06, "loss": 0.0256, "step": 486 }, { "epoch": 1.5661439485323683, "grad_norm": 0.7159017324447632, "learning_rate": 1.3374771380043306e-06, "loss": 0.0221, "step": 487 }, { "epoch": 1.569360675512666, "grad_norm": 0.7216795086860657, "learning_rate": 1.3183715117440143e-06, "loss": 0.0293, "step": 488 }, { "epoch": 1.5725774024929633, "grad_norm": 0.5431943535804749, "learning_rate": 1.2993825852972559e-06, "loss": 0.0152, "step": 489 }, { "epoch": 1.575794129473261, "grad_norm": 1.1157512664794922, "learning_rate": 1.280510960572745e-06, "loss": 0.0297, "step": 490 }, { "epoch": 1.5790108564535585, "grad_norm": 0.8432955741882324, "learning_rate": 1.2617572357609565e-06, "loss": 0.023, "step": 491 }, { "epoch": 1.582227583433856, "grad_norm": 0.8817299008369446, "learning_rate": 1.2431220053151832e-06, "loss": 0.0276, "step": 492 }, { "epoch": 1.5854443104141536, "grad_norm": 0.6438712477684021, "learning_rate": 1.2246058599327021e-06, "loss": 0.0197, "step": 493 }, { "epoch": 1.5886610373944512, "grad_norm": 0.947399377822876, "learning_rate": 1.2062093865360458e-06, "loss": 0.0266, "step": 494 }, { "epoch": 1.5918777643747486, "grad_norm": 0.8782487511634827, "learning_rate": 1.1879331682543972e-06, "loss": 0.0146, "step": 495 }, { "epoch": 1.5950944913550462, "grad_norm": 0.676629364490509, "learning_rate": 1.1697777844051105e-06, "loss": 0.0266, "step": 496 }, { "epoch": 1.5983112183353438, "grad_norm": 1.0170975923538208, "learning_rate": 1.1517438104753386e-06, "loss": 0.0278, "step": 497 }, { "epoch": 1.6015279453156412, "grad_norm": 0.8391772508621216, "learning_rate": 1.1338318181038037e-06, "loss": 0.0264, "step": 498 }, { "epoch": 1.6047446722959389, "grad_norm": 0.6024744510650635, "learning_rate": 1.1160423750626693e-06, "loss": 0.0297, "step": 499 }, { "epoch": 1.6079613992762365, "grad_norm": 0.5469645261764526, "learning_rate": 1.0983760452395415e-06, "loss": 0.0181, "step": 500 }, { "epoch": 1.611178126256534, "grad_norm": 0.6368651986122131, "learning_rate": 1.0808333886196038e-06, "loss": 0.0203, "step": 501 }, { "epoch": 1.6143948532368315, "grad_norm": 0.6552625894546509, "learning_rate": 1.063414961267859e-06, "loss": 0.0243, "step": 502 }, { "epoch": 1.6176115802171291, "grad_norm": 0.5399875640869141, "learning_rate": 1.046121315311508e-06, "loss": 0.0201, "step": 503 }, { "epoch": 1.6208283071974265, "grad_norm": 0.7226431369781494, "learning_rate": 1.02895299892244e-06, "loss": 0.0269, "step": 504 }, { "epoch": 1.6240450341777242, "grad_norm": 0.7085054516792297, "learning_rate": 1.01191055629987e-06, "loss": 0.0253, "step": 505 }, { "epoch": 1.6272617611580218, "grad_norm": 0.7027333974838257, "learning_rate": 9.949945276530782e-07, "loss": 0.0202, "step": 506 }, { "epoch": 1.6304784881383192, "grad_norm": 0.669688880443573, "learning_rate": 9.782054491842879e-07, "loss": 0.0184, "step": 507 }, { "epoch": 1.6336952151186168, "grad_norm": 0.7410572171211243, "learning_rate": 9.615438530716753e-07, "loss": 0.0144, "step": 508 }, { "epoch": 1.6369119420989144, "grad_norm": 0.8965449333190918, "learning_rate": 9.450102674524952e-07, "loss": 0.0206, "step": 509 }, { "epoch": 1.6401286690792118, "grad_norm": 0.8675963878631592, "learning_rate": 9.286052164063369e-07, "loss": 0.0267, "step": 510 }, { "epoch": 1.6433453960595095, "grad_norm": 0.9799548387527466, "learning_rate": 9.123292199385247e-07, "loss": 0.0334, "step": 511 }, { "epoch": 1.646562123039807, "grad_norm": 0.5996445417404175, "learning_rate": 8.961827939636198e-07, "loss": 0.023, "step": 512 }, { "epoch": 1.6497788500201045, "grad_norm": 0.6122687458992004, "learning_rate": 8.801664502890722e-07, "loss": 0.0214, "step": 513 }, { "epoch": 1.6529955770004021, "grad_norm": 0.6234684586524963, "learning_rate": 8.64280696599008e-07, "loss": 0.0215, "step": 514 }, { "epoch": 1.6562123039806997, "grad_norm": 0.5246209502220154, "learning_rate": 8.485260364381187e-07, "loss": 0.0186, "step": 515 }, { "epoch": 1.6594290309609971, "grad_norm": 1.306154489517212, "learning_rate": 8.329029691957124e-07, "loss": 0.0257, "step": 516 }, { "epoch": 1.6626457579412948, "grad_norm": 0.8695035576820374, "learning_rate": 8.17411990089883e-07, "loss": 0.0389, "step": 517 }, { "epoch": 1.6658624849215924, "grad_norm": 0.5970683693885803, "learning_rate": 8.02053590151805e-07, "loss": 0.0281, "step": 518 }, { "epoch": 1.6690792119018898, "grad_norm": 0.7056528329849243, "learning_rate": 7.868282562101819e-07, "loss": 0.0212, "step": 519 }, { "epoch": 1.6722959388821874, "grad_norm": 0.8744556307792664, "learning_rate": 7.717364708758024e-07, "loss": 0.0265, "step": 520 }, { "epoch": 1.675512665862485, "grad_norm": 0.8215451240539551, "learning_rate": 7.567787125262449e-07, "loss": 0.0197, "step": 521 }, { "epoch": 1.6787293928427824, "grad_norm": 1.1487066745758057, "learning_rate": 7.41955455290726e-07, "loss": 0.03, "step": 522 }, { "epoch": 1.68194611982308, "grad_norm": 0.7303391098976135, "learning_rate": 7.27267169035053e-07, "loss": 0.0253, "step": 523 }, { "epoch": 1.6851628468033777, "grad_norm": 0.5748582482337952, "learning_rate": 7.127143193467445e-07, "loss": 0.0209, "step": 524 }, { "epoch": 1.688379573783675, "grad_norm": 1.0102810859680176, "learning_rate": 6.982973675202676e-07, "loss": 0.0263, "step": 525 }, { "epoch": 1.6915963007639725, "grad_norm": 0.7919561862945557, "learning_rate": 6.840167705424106e-07, "loss": 0.0268, "step": 526 }, { "epoch": 1.6948130277442703, "grad_norm": 0.792404294013977, "learning_rate": 6.698729810778065e-07, "loss": 0.0158, "step": 527 }, { "epoch": 1.6980297547245677, "grad_norm": 1.081179141998291, "learning_rate": 6.558664474545817e-07, "loss": 0.0312, "step": 528 }, { "epoch": 1.7012464817048651, "grad_norm": 0.6410866975784302, "learning_rate": 6.419976136501377e-07, "loss": 0.0188, "step": 529 }, { "epoch": 1.704463208685163, "grad_norm": 0.5669673085212708, "learning_rate": 6.282669192770896e-07, "loss": 0.0164, "step": 530 }, { "epoch": 1.7076799356654604, "grad_norm": 0.6292052268981934, "learning_rate": 6.146747995693225e-07, "loss": 0.0251, "step": 531 }, { "epoch": 1.7108966626457578, "grad_norm": 0.8831068277359009, "learning_rate": 6.012216853682001e-07, "loss": 0.0271, "step": 532 }, { "epoch": 1.7141133896260556, "grad_norm": 0.8742364048957825, "learning_rate": 5.879080031089047e-07, "loss": 0.0297, "step": 533 }, { "epoch": 1.717330116606353, "grad_norm": 0.6837791800498962, "learning_rate": 5.747341748069229e-07, "loss": 0.0184, "step": 534 }, { "epoch": 1.7205468435866504, "grad_norm": 0.7482956647872925, "learning_rate": 5.617006180446688e-07, "loss": 0.0237, "step": 535 }, { "epoch": 1.7237635705669483, "grad_norm": 0.5687646269798279, "learning_rate": 5.488077459582425e-07, "loss": 0.0219, "step": 536 }, { "epoch": 1.7269802975472457, "grad_norm": 0.5424864292144775, "learning_rate": 5.360559672243421e-07, "loss": 0.0228, "step": 537 }, { "epoch": 1.730197024527543, "grad_norm": 0.6900858879089355, "learning_rate": 5.234456860473042e-07, "loss": 0.0227, "step": 538 }, { "epoch": 1.7334137515078407, "grad_norm": 0.5527268648147583, "learning_rate": 5.109773021462921e-07, "loss": 0.0156, "step": 539 }, { "epoch": 1.7366304784881383, "grad_norm": 0.7617378234863281, "learning_rate": 4.986512107426283e-07, "loss": 0.0308, "step": 540 }, { "epoch": 1.7398472054684357, "grad_norm": 0.8059920072555542, "learning_rate": 4.864678025472635e-07, "loss": 0.0119, "step": 541 }, { "epoch": 1.7430639324487334, "grad_norm": 0.8831843137741089, "learning_rate": 4.7442746374839363e-07, "loss": 0.0267, "step": 542 }, { "epoch": 1.746280659429031, "grad_norm": 0.5986456274986267, "learning_rate": 4.625305759992205e-07, "loss": 0.0185, "step": 543 }, { "epoch": 1.7494973864093284, "grad_norm": 0.9453976154327393, "learning_rate": 4.50777516405847e-07, "loss": 0.0244, "step": 544 }, { "epoch": 1.752714113389626, "grad_norm": 0.7596896290779114, "learning_rate": 4.3916865751533313e-07, "loss": 0.024, "step": 545 }, { "epoch": 1.7559308403699236, "grad_norm": 0.9080878496170044, "learning_rate": 4.2770436730388166e-07, "loss": 0.0281, "step": 546 }, { "epoch": 1.759147567350221, "grad_norm": 1.2852977514266968, "learning_rate": 4.163850091651717e-07, "loss": 0.0345, "step": 547 }, { "epoch": 1.7623642943305187, "grad_norm": 0.9868035912513733, "learning_rate": 4.05210941898847e-07, "loss": 0.0203, "step": 548 }, { "epoch": 1.7655810213108163, "grad_norm": 0.749136745929718, "learning_rate": 3.941825196991378e-07, "loss": 0.0249, "step": 549 }, { "epoch": 1.7687977482911137, "grad_norm": 0.9028546810150146, "learning_rate": 3.8330009214363197e-07, "loss": 0.0218, "step": 550 }, { "epoch": 1.7720144752714113, "grad_norm": 0.8566029667854309, "learning_rate": 3.725640041822026e-07, "loss": 0.0251, "step": 551 }, { "epoch": 1.775231202251709, "grad_norm": 0.6764525771141052, "learning_rate": 3.619745961260623e-07, "loss": 0.0255, "step": 552 }, { "epoch": 1.7784479292320063, "grad_norm": 0.8676577210426331, "learning_rate": 3.5153220363698225e-07, "loss": 0.0233, "step": 553 }, { "epoch": 1.781664656212304, "grad_norm": 0.7067648768424988, "learning_rate": 3.4123715771665786e-07, "loss": 0.0211, "step": 554 }, { "epoch": 1.7848813831926016, "grad_norm": 0.6299604773521423, "learning_rate": 3.310897846962041e-07, "loss": 0.0158, "step": 555 }, { "epoch": 1.788098110172899, "grad_norm": 0.5761677026748657, "learning_rate": 3.2109040622582186e-07, "loss": 0.0129, "step": 556 }, { "epoch": 1.7913148371531966, "grad_norm": 0.7623018622398376, "learning_rate": 3.112393392645985e-07, "loss": 0.0241, "step": 557 }, { "epoch": 1.7945315641334942, "grad_norm": 0.8402174711227417, "learning_rate": 3.015368960704584e-07, "loss": 0.026, "step": 558 }, { "epoch": 1.7977482911137916, "grad_norm": 0.7599536776542664, "learning_rate": 2.919833841902714e-07, "loss": 0.0229, "step": 559 }, { "epoch": 1.8009650180940893, "grad_norm": 0.5555005073547363, "learning_rate": 2.8257910645009935e-07, "loss": 0.025, "step": 560 }, { "epoch": 1.8041817450743869, "grad_norm": 0.9722334742546082, "learning_rate": 2.733243609455971e-07, "loss": 0.0248, "step": 561 }, { "epoch": 1.8073984720546843, "grad_norm": 0.47031423449516296, "learning_rate": 2.6421944103256657e-07, "loss": 0.0118, "step": 562 }, { "epoch": 1.810615199034982, "grad_norm": 0.6891758441925049, "learning_rate": 2.5526463531765467e-07, "loss": 0.0272, "step": 563 }, { "epoch": 1.8138319260152795, "grad_norm": 0.5683274269104004, "learning_rate": 2.4646022764920843e-07, "loss": 0.0181, "step": 564 }, { "epoch": 1.817048652995577, "grad_norm": 0.5917408466339111, "learning_rate": 2.3780649710827552e-07, "loss": 0.0223, "step": 565 }, { "epoch": 1.8202653799758746, "grad_norm": 0.6025788187980652, "learning_rate": 2.2930371799975593e-07, "loss": 0.0209, "step": 566 }, { "epoch": 1.8234821069561722, "grad_norm": 0.6595741510391235, "learning_rate": 2.20952159843712e-07, "loss": 0.0197, "step": 567 }, { "epoch": 1.8266988339364696, "grad_norm": 0.5931892395019531, "learning_rate": 2.1275208736682262e-07, "loss": 0.0149, "step": 568 }, { "epoch": 1.8299155609167672, "grad_norm": 0.45539724826812744, "learning_rate": 2.0470376049398944e-07, "loss": 0.0132, "step": 569 }, { "epoch": 1.8331322878970648, "grad_norm": 0.7427828311920166, "learning_rate": 1.9680743434010385e-07, "loss": 0.0201, "step": 570 }, { "epoch": 1.8363490148773622, "grad_norm": 0.7957307696342468, "learning_rate": 1.8906335920195418e-07, "loss": 0.0172, "step": 571 }, { "epoch": 1.8395657418576599, "grad_norm": 0.6128239035606384, "learning_rate": 1.814717805502958e-07, "loss": 0.023, "step": 572 }, { "epoch": 1.8427824688379575, "grad_norm": 0.6324180364608765, "learning_rate": 1.7403293902206851e-07, "loss": 0.0231, "step": 573 }, { "epoch": 1.845999195818255, "grad_norm": 0.781023383140564, "learning_rate": 1.667470704127694e-07, "loss": 0.0323, "step": 574 }, { "epoch": 1.8492159227985525, "grad_norm": 0.5555590391159058, "learning_rate": 1.5961440566897913e-07, "loss": 0.0237, "step": 575 }, { "epoch": 1.8524326497788501, "grad_norm": 0.8592231869697571, "learning_rate": 1.5263517088103862e-07, "loss": 0.0232, "step": 576 }, { "epoch": 1.8556493767591475, "grad_norm": 0.6279895901679993, "learning_rate": 1.4580958727588746e-07, "loss": 0.0154, "step": 577 }, { "epoch": 1.8588661037394452, "grad_norm": 0.7780999541282654, "learning_rate": 1.3913787121004717e-07, "loss": 0.0255, "step": 578 }, { "epoch": 1.8620828307197428, "grad_norm": 0.9826112985610962, "learning_rate": 1.3262023416276414e-07, "loss": 0.0293, "step": 579 }, { "epoch": 1.8652995577000402, "grad_norm": 0.7337117791175842, "learning_rate": 1.2625688272930925e-07, "loss": 0.0244, "step": 580 }, { "epoch": 1.8685162846803376, "grad_norm": 0.7108997106552124, "learning_rate": 1.2004801861442373e-07, "loss": 0.0227, "step": 581 }, { "epoch": 1.8717330116606354, "grad_norm": 0.6001144647598267, "learning_rate": 1.1399383862592928e-07, "loss": 0.0242, "step": 582 }, { "epoch": 1.8749497386409328, "grad_norm": 1.1188668012619019, "learning_rate": 1.0809453466849029e-07, "loss": 0.0277, "step": 583 }, { "epoch": 1.8781664656212302, "grad_norm": 0.7146681547164917, "learning_rate": 1.0235029373752758e-07, "loss": 0.0218, "step": 584 }, { "epoch": 1.881383192601528, "grad_norm": 0.6120503544807434, "learning_rate": 9.676129791329481e-08, "loss": 0.0232, "step": 585 }, { "epoch": 1.8845999195818255, "grad_norm": 0.69977205991745, "learning_rate": 9.132772435510362e-08, "loss": 0.0255, "step": 586 }, { "epoch": 1.887816646562123, "grad_norm": 0.9345167279243469, "learning_rate": 8.604974529571042e-08, "loss": 0.0368, "step": 587 }, { "epoch": 1.8910333735424207, "grad_norm": 0.7977281212806702, "learning_rate": 8.092752803585513e-08, "loss": 0.0234, "step": 588 }, { "epoch": 1.8942501005227181, "grad_norm": 0.7188998460769653, "learning_rate": 7.59612349389599e-08, "loss": 0.0222, "step": 589 }, { "epoch": 1.8974668275030155, "grad_norm": 1.3486685752868652, "learning_rate": 7.115102342598101e-08, "loss": 0.0347, "step": 590 }, { "epoch": 1.9006835544833134, "grad_norm": 0.7500903010368347, "learning_rate": 6.649704597042061e-08, "loss": 0.022, "step": 591 }, { "epoch": 1.9039002814636108, "grad_norm": 0.5467425584793091, "learning_rate": 6.199945009349173e-08, "loss": 0.0195, "step": 592 }, { "epoch": 1.9071170084439082, "grad_norm": 0.7178412079811096, "learning_rate": 5.7658378359443104e-08, "loss": 0.0307, "step": 593 }, { "epoch": 1.910333735424206, "grad_norm": 0.8701561093330383, "learning_rate": 5.3473968371040575e-08, "loss": 0.0206, "step": 594 }, { "epoch": 1.9135504624045034, "grad_norm": 0.7027232050895691, "learning_rate": 4.944635276520393e-08, "loss": 0.0231, "step": 595 }, { "epoch": 1.9167671893848008, "grad_norm": 0.711676299571991, "learning_rate": 4.55756592088058e-08, "loss": 0.0219, "step": 596 }, { "epoch": 1.9199839163650985, "grad_norm": 0.8163571357727051, "learning_rate": 4.186201039462046e-08, "loss": 0.0243, "step": 597 }, { "epoch": 1.923200643345396, "grad_norm": 0.781928539276123, "learning_rate": 3.8305524037438035e-08, "loss": 0.0269, "step": 598 }, { "epoch": 1.9264173703256935, "grad_norm": 0.7710465788841248, "learning_rate": 3.4906312870331973e-08, "loss": 0.0172, "step": 599 }, { "epoch": 1.9296340973059911, "grad_norm": 0.9015802145004272, "learning_rate": 3.166448464108629e-08, "loss": 0.0302, "step": 600 }, { "epoch": 1.9328508242862887, "grad_norm": 0.857291042804718, "learning_rate": 2.8580142108778354e-08, "loss": 0.0182, "step": 601 }, { "epoch": 1.9360675512665861, "grad_norm": 0.8512669801712036, "learning_rate": 2.5653383040524228e-08, "loss": 0.0299, "step": 602 }, { "epoch": 1.9392842782468838, "grad_norm": 0.8255121111869812, "learning_rate": 2.2884300208378395e-08, "loss": 0.0278, "step": 603 }, { "epoch": 1.9425010052271814, "grad_norm": 0.5912429690361023, "learning_rate": 2.0272981386393332e-08, "loss": 0.0207, "step": 604 }, { "epoch": 1.9457177322074788, "grad_norm": 0.6523793935775757, "learning_rate": 1.781950934783505e-08, "loss": 0.0177, "step": 605 }, { "epoch": 1.9489344591877764, "grad_norm": 0.7320718169212341, "learning_rate": 1.552396186256411e-08, "loss": 0.0255, "step": 606 }, { "epoch": 1.952151186168074, "grad_norm": 0.6994187235832214, "learning_rate": 1.3386411694565894e-08, "loss": 0.0278, "step": 607 }, { "epoch": 1.9553679131483714, "grad_norm": 0.5923306941986084, "learning_rate": 1.1406926599646373e-08, "loss": 0.0192, "step": 608 }, { "epoch": 1.958584640128669, "grad_norm": 0.7130267024040222, "learning_rate": 9.585569323284915e-09, "loss": 0.0265, "step": 609 }, { "epoch": 1.9618013671089667, "grad_norm": 0.9426466226577759, "learning_rate": 7.922397598642551e-09, "loss": 0.0357, "step": 610 }, { "epoch": 1.965018094089264, "grad_norm": 0.6841176748275757, "learning_rate": 6.417464144736208e-09, "loss": 0.0232, "step": 611 }, { "epoch": 1.9682348210695617, "grad_norm": 0.6261034607887268, "learning_rate": 5.0708166647628345e-09, "loss": 0.0205, "step": 612 }, { "epoch": 1.9714515480498593, "grad_norm": 0.7374976873397827, "learning_rate": 3.88249784459227e-09, "loss": 0.0193, "step": 613 }, { "epoch": 1.9746682750301567, "grad_norm": 0.676289975643158, "learning_rate": 2.8525453514099966e-09, "loss": 0.0177, "step": 614 }, { "epoch": 1.9778850020104544, "grad_norm": 0.8624617457389832, "learning_rate": 1.980991832524759e-09, "loss": 0.0385, "step": 615 }, { "epoch": 1.981101728990752, "grad_norm": 0.6072701811790466, "learning_rate": 1.2678649143349485e-09, "loss": 0.0219, "step": 616 }, { "epoch": 1.9843184559710494, "grad_norm": 0.620798647403717, "learning_rate": 7.131872014509711e-10, "loss": 0.0211, "step": 617 }, { "epoch": 1.987535182951347, "grad_norm": 0.5879101753234863, "learning_rate": 3.1697627597970794e-10, "loss": 0.0169, "step": 618 }, { "epoch": 1.9907519099316446, "grad_norm": 1.4460906982421875, "learning_rate": 7.924469696718451e-11, "loss": 0.0259, "step": 619 }, { "epoch": 1.993968636911942, "grad_norm": 0.9200376868247986, "learning_rate": 0.0, "loss": 0.0252, "step": 620 }, { "epoch": 1.993968636911942, "step": 620, "total_flos": 7.073469349725471e+17, "train_loss": 0.04489339354387935, "train_runtime": 2537.7411, "train_samples_per_second": 125.415, "train_steps_per_second": 0.244 } ], "logging_steps": 1.0, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.073469349725471e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }