{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 1.08417234214877, "learning_rate": 0.0, "loss": 0.5767, "step": 1 }, { "epoch": 0.032, "grad_norm": 1.0469008257052523, "learning_rate": 1.6666666666666667e-06, "loss": 0.5511, "step": 2 }, { "epoch": 0.048, "grad_norm": 0.983282752070103, "learning_rate": 3.3333333333333333e-06, "loss": 0.4992, "step": 3 }, { "epoch": 0.064, "grad_norm": 0.872051375279908, "learning_rate": 5e-06, "loss": 0.4061, "step": 4 }, { "epoch": 0.08, "grad_norm": 0.893228824528603, "learning_rate": 6.666666666666667e-06, "loss": 0.4181, "step": 5 }, { "epoch": 0.096, "grad_norm": 0.9220468625068179, "learning_rate": 8.333333333333334e-06, "loss": 0.4488, "step": 6 }, { "epoch": 0.112, "grad_norm": 0.7934650361234915, "learning_rate": 1e-05, "loss": 0.412, "step": 7 }, { "epoch": 0.128, "grad_norm": 0.5840477320156379, "learning_rate": 9.999336914672622e-06, "loss": 0.417, "step": 8 }, { "epoch": 0.144, "grad_norm": 0.505453502147165, "learning_rate": 9.997347854104775e-06, "loss": 0.5068, "step": 9 }, { "epoch": 0.16, "grad_norm": 0.5244034547886003, "learning_rate": 9.994033404481737e-06, "loss": 0.5052, "step": 10 }, { "epoch": 0.176, "grad_norm": 0.43986559011579357, "learning_rate": 9.98939454258703e-06, "loss": 0.498, "step": 11 }, { "epoch": 0.192, "grad_norm": 0.6155944923795194, "learning_rate": 9.98343263551454e-06, "loss": 0.4757, "step": 12 }, { "epoch": 0.208, "grad_norm": 0.6010488091076704, "learning_rate": 9.97614944026565e-06, "loss": 0.4733, "step": 13 }, { "epoch": 0.224, "grad_norm": 0.6021634931994316, "learning_rate": 9.967547103231432e-06, "loss": 0.4258, "step": 14 }, { "epoch": 0.24, "grad_norm": 0.6482124300684173, "learning_rate": 9.957628159560088e-06, "loss": 0.5263, "step": 15 }, { "epoch": 0.256, "grad_norm": 0.554249479130026, "learning_rate": 9.946395532409847e-06, "loss": 0.4806, "step": 16 }, { "epoch": 0.272, "grad_norm": 0.5973792849680928, "learning_rate": 9.933852532087492e-06, "loss": 0.4736, "step": 17 }, { "epoch": 0.288, "grad_norm": 0.41453038423395644, "learning_rate": 9.920002855072784e-06, "loss": 0.4289, "step": 18 }, { "epoch": 0.304, "grad_norm": 0.4620366674199325, "learning_rate": 9.904850582929112e-06, "loss": 0.4864, "step": 19 }, { "epoch": 0.32, "grad_norm": 0.35504897921565515, "learning_rate": 9.888400181100621e-06, "loss": 0.4219, "step": 20 }, { "epoch": 0.336, "grad_norm": 0.31658139671010693, "learning_rate": 9.870656497596242e-06, "loss": 0.3972, "step": 21 }, { "epoch": 0.352, "grad_norm": 0.3650004342330343, "learning_rate": 9.851624761560943e-06, "loss": 0.5571, "step": 22 }, { "epoch": 0.368, "grad_norm": 0.3257963763381936, "learning_rate": 9.831310581734687e-06, "loss": 0.4769, "step": 23 }, { "epoch": 0.384, "grad_norm": 0.325775920332389, "learning_rate": 9.809719944799512e-06, "loss": 0.4961, "step": 24 }, { "epoch": 0.4, "grad_norm": 0.31671335015470503, "learning_rate": 9.786859213615222e-06, "loss": 0.4357, "step": 25 }, { "epoch": 0.416, "grad_norm": 0.3281761016809856, "learning_rate": 9.762735125344227e-06, "loss": 0.4502, "step": 26 }, { "epoch": 0.432, "grad_norm": 0.3554086729723955, "learning_rate": 9.737354789466068e-06, "loss": 0.4768, "step": 27 }, { "epoch": 0.448, "grad_norm": 0.3865446724293106, "learning_rate": 9.710725685682222e-06, "loss": 0.4515, "step": 28 }, { "epoch": 0.464, "grad_norm": 0.37003410749958987, "learning_rate": 9.682855661711803e-06, "loss": 0.4432, "step": 29 }, { "epoch": 0.48, "grad_norm": 0.31180238962223594, "learning_rate": 9.653752930978794e-06, "loss": 0.463, "step": 30 }, { "epoch": 0.496, "grad_norm": 0.3252391994136198, "learning_rate": 9.623426070191521e-06, "loss": 0.4087, "step": 31 }, { "epoch": 0.512, "grad_norm": 0.31056365314104256, "learning_rate": 9.591884016815063e-06, "loss": 0.4254, "step": 32 }, { "epoch": 0.528, "grad_norm": 0.32735595897929803, "learning_rate": 9.559136066437319e-06, "loss": 0.5447, "step": 33 }, { "epoch": 0.544, "grad_norm": 0.3100995737340131, "learning_rate": 9.52519187002958e-06, "loss": 0.4321, "step": 34 }, { "epoch": 0.56, "grad_norm": 0.2936011999974949, "learning_rate": 9.49006143110233e-06, "loss": 0.4156, "step": 35 }, { "epoch": 0.576, "grad_norm": 0.29737459640312214, "learning_rate": 9.453755102757168e-06, "loss": 0.493, "step": 36 }, { "epoch": 0.592, "grad_norm": 0.2728926853035778, "learning_rate": 9.4162835846357e-06, "loss": 0.4392, "step": 37 }, { "epoch": 0.608, "grad_norm": 0.3272565049177079, "learning_rate": 9.377657919766307e-06, "loss": 0.491, "step": 38 }, { "epoch": 0.624, "grad_norm": 0.2822226280837269, "learning_rate": 9.33788949130972e-06, "loss": 0.4266, "step": 39 }, { "epoch": 0.64, "grad_norm": 0.3438862449071104, "learning_rate": 9.296990019204336e-06, "loss": 0.4819, "step": 40 }, { "epoch": 0.656, "grad_norm": 0.3322076503353557, "learning_rate": 9.254971556712314e-06, "loss": 0.4597, "step": 41 }, { "epoch": 0.672, "grad_norm": 0.3262451955412676, "learning_rate": 9.21184648686741e-06, "loss": 0.4891, "step": 42 }, { "epoch": 0.688, "grad_norm": 0.27720763525504155, "learning_rate": 9.167627518825651e-06, "loss": 0.4262, "step": 43 }, { "epoch": 0.704, "grad_norm": 0.291234979530001, "learning_rate": 9.122327684119883e-06, "loss": 0.3755, "step": 44 }, { "epoch": 0.72, "grad_norm": 0.2910628070235976, "learning_rate": 9.075960332819314e-06, "loss": 0.4294, "step": 45 }, { "epoch": 0.736, "grad_norm": 0.3184275314698029, "learning_rate": 9.028539129595199e-06, "loss": 0.5769, "step": 46 }, { "epoch": 0.752, "grad_norm": 0.2722946981234995, "learning_rate": 8.980078049693785e-06, "loss": 0.4019, "step": 47 }, { "epoch": 0.768, "grad_norm": 0.29789872299343384, "learning_rate": 8.930591374817757e-06, "loss": 0.4095, "step": 48 }, { "epoch": 0.784, "grad_norm": 0.3287733650721115, "learning_rate": 8.88009368891734e-06, "loss": 0.5409, "step": 49 }, { "epoch": 0.8, "grad_norm": 0.30019238586389524, "learning_rate": 8.828599873892351e-06, "loss": 0.4375, "step": 50 }, { "epoch": 0.816, "grad_norm": 0.31598561474060005, "learning_rate": 8.776125105206433e-06, "loss": 0.4902, "step": 51 }, { "epoch": 0.832, "grad_norm": 0.31172751963714107, "learning_rate": 8.722684847414771e-06, "loss": 0.5176, "step": 52 }, { "epoch": 0.848, "grad_norm": 0.2807830457433651, "learning_rate": 8.668294849606626e-06, "loss": 0.4707, "step": 53 }, { "epoch": 0.864, "grad_norm": 0.2960941109569259, "learning_rate": 8.612971140764e-06, "loss": 0.4283, "step": 54 }, { "epoch": 0.88, "grad_norm": 0.3299368286152574, "learning_rate": 8.556730025037819e-06, "loss": 0.4088, "step": 55 }, { "epoch": 0.896, "grad_norm": 0.26428803479637797, "learning_rate": 8.499588076943036e-06, "loss": 0.3912, "step": 56 }, { "epoch": 0.912, "grad_norm": 0.3009234600421325, "learning_rate": 8.441562136474028e-06, "loss": 0.4067, "step": 57 }, { "epoch": 0.928, "grad_norm": 0.3609455923426209, "learning_rate": 8.38266930414179e-06, "loss": 0.4898, "step": 58 }, { "epoch": 0.944, "grad_norm": 0.31387386515515087, "learning_rate": 8.322926935934323e-06, "loss": 0.4282, "step": 59 }, { "epoch": 0.96, "grad_norm": 0.2845268045281065, "learning_rate": 8.262352638201754e-06, "loss": 0.4841, "step": 60 }, { "epoch": 0.976, "grad_norm": 0.30381093535089976, "learning_rate": 8.200964262467658e-06, "loss": 0.5444, "step": 61 }, { "epoch": 0.992, "grad_norm": 0.27310289060720006, "learning_rate": 8.13877990016813e-06, "loss": 0.341, "step": 62 }, { "epoch": 1.0, "grad_norm": 0.27310289060720006, "learning_rate": 8.075817877320167e-06, "loss": 0.4322, "step": 63 }, { "epoch": 1.016, "grad_norm": 0.48420660950568, "learning_rate": 8.012096749120892e-06, "loss": 0.4956, "step": 64 }, { "epoch": 1.032, "grad_norm": 0.2856122575450543, "learning_rate": 7.947635294479262e-06, "loss": 0.3808, "step": 65 }, { "epoch": 1.048, "grad_norm": 0.2959094703912616, "learning_rate": 7.882452510481834e-06, "loss": 0.4794, "step": 66 }, { "epoch": 1.064, "grad_norm": 0.2587455831416144, "learning_rate": 7.81656760679424e-06, "loss": 0.4343, "step": 67 }, { "epoch": 1.08, "grad_norm": 0.2789553742202286, "learning_rate": 7.75e-06, "loss": 0.4484, "step": 68 }, { "epoch": 1.096, "grad_norm": 0.3039866797711843, "learning_rate": 7.682769307878384e-06, "loss": 0.4165, "step": 69 }, { "epoch": 1.112, "grad_norm": 0.2588236721874104, "learning_rate": 7.614895343622941e-06, "loss": 0.4181, "step": 70 }, { "epoch": 1.1280000000000001, "grad_norm": 0.27938933998248044, "learning_rate": 7.546398110002477e-06, "loss": 0.4349, "step": 71 }, { "epoch": 1.144, "grad_norm": 0.30500409100487047, "learning_rate": 7.477297793466137e-06, "loss": 0.4195, "step": 72 }, { "epoch": 1.16, "grad_norm": 0.2703316954052495, "learning_rate": 7.407614758194375e-06, "loss": 0.4593, "step": 73 }, { "epoch": 1.176, "grad_norm": 0.29674738729978956, "learning_rate": 7.337369540097521e-06, "loss": 0.4018, "step": 74 }, { "epoch": 1.192, "grad_norm": 0.3434929792222651, "learning_rate": 7.266582840763774e-06, "loss": 0.4633, "step": 75 }, { "epoch": 1.208, "grad_norm": 0.32994268144478006, "learning_rate": 7.195275521358334e-06, "loss": 0.4982, "step": 76 }, { "epoch": 1.224, "grad_norm": 0.2751594270204829, "learning_rate": 7.123468596475526e-06, "loss": 0.4717, "step": 77 }, { "epoch": 1.24, "grad_norm": 0.3277135583956103, "learning_rate": 7.051183227945703e-06, "loss": 0.3922, "step": 78 }, { "epoch": 1.256, "grad_norm": 0.3178890245565396, "learning_rate": 6.978440718598757e-06, "loss": 0.4359, "step": 79 }, { "epoch": 1.272, "grad_norm": 0.2936671959796726, "learning_rate": 6.905262505986076e-06, "loss": 0.4482, "step": 80 }, { "epoch": 1.288, "grad_norm": 0.30577342188899176, "learning_rate": 6.8316701560628015e-06, "loss": 0.3457, "step": 81 }, { "epoch": 1.304, "grad_norm": 0.25812738212578895, "learning_rate": 6.757685356832243e-06, "loss": 0.4501, "step": 82 }, { "epoch": 1.32, "grad_norm": 0.31439500562242606, "learning_rate": 6.683329911954316e-06, "loss": 0.5262, "step": 83 }, { "epoch": 1.336, "grad_norm": 0.3508949566807093, "learning_rate": 6.608625734319917e-06, "loss": 0.3605, "step": 84 }, { "epoch": 1.3519999999999999, "grad_norm": 0.29111139241084, "learning_rate": 6.5335948395930815e-06, "loss": 0.4049, "step": 85 }, { "epoch": 1.3679999999999999, "grad_norm": 0.30291591445235805, "learning_rate": 6.458259339722871e-06, "loss": 0.4092, "step": 86 }, { "epoch": 1.384, "grad_norm": 0.29313552915613017, "learning_rate": 6.382641436426887e-06, "loss": 0.3528, "step": 87 }, { "epoch": 1.4, "grad_norm": 0.2583754481347982, "learning_rate": 6.306763414648311e-06, "loss": 0.3559, "step": 88 }, { "epoch": 1.416, "grad_norm": 0.26863426430444226, "learning_rate": 6.230647635988437e-06, "loss": 0.4254, "step": 89 }, { "epoch": 1.432, "grad_norm": 0.3093298693816597, "learning_rate": 6.154316532116605e-06, "loss": 0.3579, "step": 90 }, { "epoch": 1.448, "grad_norm": 0.25998129570390127, "learning_rate": 6.0777925981594795e-06, "loss": 0.4028, "step": 91 }, { "epoch": 1.464, "grad_norm": 0.3033921125695614, "learning_rate": 6.00109838607164e-06, "loss": 0.3805, "step": 92 }, { "epoch": 1.48, "grad_norm": 0.2904725832178519, "learning_rate": 5.924256497989411e-06, "loss": 0.4323, "step": 93 }, { "epoch": 1.496, "grad_norm": 0.30355923070799345, "learning_rate": 5.84728957956991e-06, "loss": 0.3297, "step": 94 }, { "epoch": 1.512, "grad_norm": 0.2705231290116716, "learning_rate": 5.770220313317269e-06, "loss": 0.4177, "step": 95 }, { "epoch": 1.528, "grad_norm": 0.337520528610411, "learning_rate": 5.693071411897996e-06, "loss": 0.3795, "step": 96 }, { "epoch": 1.544, "grad_norm": 0.2741225861592298, "learning_rate": 5.61586561144745e-06, "loss": 0.414, "step": 97 }, { "epoch": 1.56, "grad_norm": 0.3099099396586251, "learning_rate": 5.538625664869393e-06, "loss": 0.3715, "step": 98 }, { "epoch": 1.576, "grad_norm": 0.28067657578779776, "learning_rate": 5.46137433513061e-06, "loss": 0.4295, "step": 99 }, { "epoch": 1.592, "grad_norm": 0.2290691740751286, "learning_rate": 5.384134388552552e-06, "loss": 0.3913, "step": 100 }, { "epoch": 1.608, "grad_norm": 0.292339568550721, "learning_rate": 5.306928588102005e-06, "loss": 0.4101, "step": 101 }, { "epoch": 1.624, "grad_norm": 0.26499794509502683, "learning_rate": 5.229779686682734e-06, "loss": 0.4146, "step": 102 }, { "epoch": 1.6400000000000001, "grad_norm": 0.3102018761949683, "learning_rate": 5.152710420430092e-06, "loss": 0.43, "step": 103 }, { "epoch": 1.6560000000000001, "grad_norm": 0.28357637434683114, "learning_rate": 5.0757435020105905e-06, "loss": 0.4247, "step": 104 }, { "epoch": 1.6720000000000002, "grad_norm": 0.2829919630781859, "learning_rate": 4.998901613928361e-06, "loss": 0.4038, "step": 105 }, { "epoch": 1.688, "grad_norm": 0.29284800214525813, "learning_rate": 4.922207401840521e-06, "loss": 0.3709, "step": 106 }, { "epoch": 1.704, "grad_norm": 0.3170342016576189, "learning_rate": 4.845683467883396e-06, "loss": 0.4447, "step": 107 }, { "epoch": 1.72, "grad_norm": 0.3077243277320204, "learning_rate": 4.7693523640115646e-06, "loss": 0.3928, "step": 108 }, { "epoch": 1.736, "grad_norm": 0.2778878775122821, "learning_rate": 4.693236585351692e-06, "loss": 0.4175, "step": 109 }, { "epoch": 1.752, "grad_norm": 0.28795862892188867, "learning_rate": 4.617358563573114e-06, "loss": 0.3722, "step": 110 }, { "epoch": 1.768, "grad_norm": 0.25342664678922827, "learning_rate": 4.541740660277131e-06, "loss": 0.39, "step": 111 }, { "epoch": 1.784, "grad_norm": 0.29510553890778607, "learning_rate": 4.466405160406922e-06, "loss": 0.4833, "step": 112 }, { "epoch": 1.8, "grad_norm": 0.2793848028342464, "learning_rate": 4.391374265680084e-06, "loss": 0.3726, "step": 113 }, { "epoch": 1.8159999999999998, "grad_norm": 0.3026649324366544, "learning_rate": 4.316670088045684e-06, "loss": 0.4718, "step": 114 }, { "epoch": 1.8319999999999999, "grad_norm": 0.27728466637557236, "learning_rate": 4.242314643167759e-06, "loss": 0.3371, "step": 115 }, { "epoch": 1.8479999999999999, "grad_norm": 0.3168390463410005, "learning_rate": 4.168329843937199e-06, "loss": 0.3784, "step": 116 }, { "epoch": 1.8639999999999999, "grad_norm": 0.23724238568345682, "learning_rate": 4.094737494013925e-06, "loss": 0.4107, "step": 117 }, { "epoch": 1.88, "grad_norm": 0.3236361234128115, "learning_rate": 4.0215592814012435e-06, "loss": 0.4976, "step": 118 }, { "epoch": 1.896, "grad_norm": 0.30334182027072204, "learning_rate": 3.948816772054298e-06, "loss": 0.472, "step": 119 }, { "epoch": 1.912, "grad_norm": 0.2680853511924467, "learning_rate": 3.876531403524476e-06, "loss": 0.3274, "step": 120 }, { "epoch": 1.928, "grad_norm": 0.30193319196237894, "learning_rate": 3.804724478641667e-06, "loss": 0.3816, "step": 121 }, { "epoch": 1.944, "grad_norm": 0.26693316547906853, "learning_rate": 3.733417159236228e-06, "loss": 0.3598, "step": 122 }, { "epoch": 1.96, "grad_norm": 0.2717103343613883, "learning_rate": 3.6626304599024797e-06, "loss": 0.3841, "step": 123 }, { "epoch": 1.976, "grad_norm": 0.31425281419423284, "learning_rate": 3.592385241805628e-06, "loss": 0.4168, "step": 124 }, { "epoch": 1.992, "grad_norm": 0.25599583818936533, "learning_rate": 3.5227022065338623e-06, "loss": 0.4434, "step": 125 }, { "epoch": 2.0, "grad_norm": 0.531221874086192, "learning_rate": 3.4536018899975255e-06, "loss": 0.3939, "step": 126 }, { "epoch": 2.016, "grad_norm": 0.26919931935198743, "learning_rate": 3.3851046563770617e-06, "loss": 0.3551, "step": 127 }, { "epoch": 2.032, "grad_norm": 0.2820331296888726, "learning_rate": 3.317230692121618e-06, "loss": 0.3248, "step": 128 }, { "epoch": 2.048, "grad_norm": 0.2560275313368148, "learning_rate": 3.2500000000000015e-06, "loss": 0.3168, "step": 129 }, { "epoch": 2.064, "grad_norm": 0.26575624913648593, "learning_rate": 3.1834323932057633e-06, "loss": 0.3388, "step": 130 }, { "epoch": 2.08, "grad_norm": 0.2882413409588101, "learning_rate": 3.117547489518167e-06, "loss": 0.416, "step": 131 }, { "epoch": 2.096, "grad_norm": 0.26749471747908243, "learning_rate": 3.0523647055207393e-06, "loss": 0.4344, "step": 132 }, { "epoch": 2.112, "grad_norm": 0.2770415005714329, "learning_rate": 2.9879032508791096e-06, "loss": 0.4068, "step": 133 }, { "epoch": 2.128, "grad_norm": 0.25222155690032644, "learning_rate": 2.9241821226798338e-06, "loss": 0.3379, "step": 134 }, { "epoch": 2.144, "grad_norm": 0.292188926393636, "learning_rate": 2.86122009983187e-06, "loss": 0.3774, "step": 135 }, { "epoch": 2.16, "grad_norm": 0.2898156735833746, "learning_rate": 2.799035737532344e-06, "loss": 0.3845, "step": 136 }, { "epoch": 2.176, "grad_norm": 0.27462927630805695, "learning_rate": 2.7376473617982456e-06, "loss": 0.3899, "step": 137 }, { "epoch": 2.192, "grad_norm": 0.28037858694245654, "learning_rate": 2.6770730640656784e-06, "loss": 0.4171, "step": 138 }, { "epoch": 2.208, "grad_norm": 0.2791430022438955, "learning_rate": 2.6173306958582125e-06, "loss": 0.3156, "step": 139 }, { "epoch": 2.224, "grad_norm": 0.29137935108000546, "learning_rate": 2.5584378635259733e-06, "loss": 0.3753, "step": 140 }, { "epoch": 2.24, "grad_norm": 0.2593660029080348, "learning_rate": 2.5004119230569655e-06, "loss": 0.3766, "step": 141 }, { "epoch": 2.2560000000000002, "grad_norm": 0.2875413507661142, "learning_rate": 2.4432699749621813e-06, "loss": 0.3967, "step": 142 }, { "epoch": 2.2720000000000002, "grad_norm": 0.2931955054483654, "learning_rate": 2.387028859236002e-06, "loss": 0.3498, "step": 143 }, { "epoch": 2.288, "grad_norm": 0.26403010549011774, "learning_rate": 2.3317051503933743e-06, "loss": 0.3428, "step": 144 }, { "epoch": 2.304, "grad_norm": 0.2777071852283896, "learning_rate": 2.2773151525852313e-06, "loss": 0.3777, "step": 145 }, { "epoch": 2.32, "grad_norm": 0.29111968942296174, "learning_rate": 2.223874894793569e-06, "loss": 0.4267, "step": 146 }, { "epoch": 2.336, "grad_norm": 0.30801800316293676, "learning_rate": 2.17140012610765e-06, "loss": 0.3731, "step": 147 }, { "epoch": 2.352, "grad_norm": 0.2950618309605883, "learning_rate": 2.119906311082662e-06, "loss": 0.4385, "step": 148 }, { "epoch": 2.368, "grad_norm": 0.24715278994212442, "learning_rate": 2.069408625182244e-06, "loss": 0.3629, "step": 149 }, { "epoch": 2.384, "grad_norm": 0.2742573979067362, "learning_rate": 2.019921950306216e-06, "loss": 0.381, "step": 150 }, { "epoch": 2.4, "grad_norm": 0.28689018960107904, "learning_rate": 1.9714608704048036e-06, "loss": 0.4096, "step": 151 }, { "epoch": 2.416, "grad_norm": 0.2624247476439798, "learning_rate": 1.924039667180687e-06, "loss": 0.3014, "step": 152 }, { "epoch": 2.432, "grad_norm": 0.269857690537919, "learning_rate": 1.8776723158801188e-06, "loss": 0.3713, "step": 153 }, { "epoch": 2.448, "grad_norm": 0.3457149846408549, "learning_rate": 1.8323724811743495e-06, "loss": 0.4165, "step": 154 }, { "epoch": 2.464, "grad_norm": 0.31737427991855693, "learning_rate": 1.78815351313259e-06, "loss": 0.4442, "step": 155 }, { "epoch": 2.48, "grad_norm": 0.3148729894178642, "learning_rate": 1.7450284432876873e-06, "loss": 0.4802, "step": 156 }, { "epoch": 2.496, "grad_norm": 0.26484117644858995, "learning_rate": 1.7030099807956649e-06, "loss": 0.3706, "step": 157 }, { "epoch": 2.512, "grad_norm": 0.2923543941519516, "learning_rate": 1.6621105086902822e-06, "loss": 0.3906, "step": 158 }, { "epoch": 2.528, "grad_norm": 0.2787856319176262, "learning_rate": 1.6223420802336933e-06, "loss": 0.3995, "step": 159 }, { "epoch": 2.544, "grad_norm": 0.2648560287594022, "learning_rate": 1.5837164153643014e-06, "loss": 0.3437, "step": 160 }, { "epoch": 2.56, "grad_norm": 0.28029815338389324, "learning_rate": 1.5462448972428334e-06, "loss": 0.4114, "step": 161 }, { "epoch": 2.576, "grad_norm": 0.32764654451552816, "learning_rate": 1.5099385688976695e-06, "loss": 0.4176, "step": 162 }, { "epoch": 2.592, "grad_norm": 0.26653675081715944, "learning_rate": 1.474808129970421e-06, "loss": 0.4394, "step": 163 }, { "epoch": 2.608, "grad_norm": 0.30303023697633824, "learning_rate": 1.4408639335626823e-06, "loss": 0.446, "step": 164 }, { "epoch": 2.624, "grad_norm": 0.30230020169875327, "learning_rate": 1.4081159831849395e-06, "loss": 0.3978, "step": 165 }, { "epoch": 2.64, "grad_norm": 0.3069707267289378, "learning_rate": 1.3765739298084792e-06, "loss": 0.3623, "step": 166 }, { "epoch": 2.656, "grad_norm": 0.27712650857341004, "learning_rate": 1.346247069021208e-06, "loss": 0.3382, "step": 167 }, { "epoch": 2.672, "grad_norm": 0.2699015904710055, "learning_rate": 1.3171443382881993e-06, "loss": 0.4117, "step": 168 }, { "epoch": 2.6879999999999997, "grad_norm": 0.29159348133947205, "learning_rate": 1.2892743143177793e-06, "loss": 0.3288, "step": 169 }, { "epoch": 2.7039999999999997, "grad_norm": 0.24614166945136878, "learning_rate": 1.262645210533934e-06, "loss": 0.4084, "step": 170 }, { "epoch": 2.7199999999999998, "grad_norm": 0.2758825153172464, "learning_rate": 1.2372648746557742e-06, "loss": 0.3513, "step": 171 }, { "epoch": 2.7359999999999998, "grad_norm": 0.29799910395406354, "learning_rate": 1.213140786384779e-06, "loss": 0.386, "step": 172 }, { "epoch": 2.752, "grad_norm": 0.32545648563171126, "learning_rate": 1.190280055200489e-06, "loss": 0.4041, "step": 173 }, { "epoch": 2.768, "grad_norm": 0.28037781950139645, "learning_rate": 1.1686894182653137e-06, "loss": 0.4193, "step": 174 }, { "epoch": 2.784, "grad_norm": 0.269297811870558, "learning_rate": 1.1483752384390583e-06, "loss": 0.3404, "step": 175 }, { "epoch": 2.8, "grad_norm": 0.2621451208629847, "learning_rate": 1.1293435024037592e-06, "loss": 0.4476, "step": 176 }, { "epoch": 2.816, "grad_norm": 0.2840465963543336, "learning_rate": 1.1115998188993788e-06, "loss": 0.3689, "step": 177 }, { "epoch": 2.832, "grad_norm": 0.30328731810375564, "learning_rate": 1.09514941707089e-06, "loss": 0.4219, "step": 178 }, { "epoch": 2.848, "grad_norm": 0.28062525330655147, "learning_rate": 1.0799971449272174e-06, "loss": 0.3932, "step": 179 }, { "epoch": 2.864, "grad_norm": 0.28452098866262326, "learning_rate": 1.0661474679125096e-06, "loss": 0.4198, "step": 180 }, { "epoch": 2.88, "grad_norm": 0.29354727351428, "learning_rate": 1.0536044675901534e-06, "loss": 0.3813, "step": 181 }, { "epoch": 2.896, "grad_norm": 0.3093060506201193, "learning_rate": 1.0423718404399139e-06, "loss": 0.4137, "step": 182 }, { "epoch": 2.912, "grad_norm": 0.2536429694083317, "learning_rate": 1.0324528967685698e-06, "loss": 0.4057, "step": 183 }, { "epoch": 2.928, "grad_norm": 0.278656315909379, "learning_rate": 1.0238505597343494e-06, "loss": 0.3078, "step": 184 }, { "epoch": 2.944, "grad_norm": 0.26173005919140585, "learning_rate": 1.0165673644854601e-06, "loss": 0.4326, "step": 185 }, { "epoch": 2.96, "grad_norm": 0.29711263236623364, "learning_rate": 1.0106054574129717e-06, "loss": 0.4436, "step": 186 }, { "epoch": 2.976, "grad_norm": 0.2412655286518132, "learning_rate": 1.0059665955182629e-06, "loss": 0.3899, "step": 187 }, { "epoch": 2.992, "grad_norm": 0.2744645092683126, "learning_rate": 1.0026521458952265e-06, "loss": 0.3335, "step": 188 }, { "epoch": 3.0, "grad_norm": 0.2744645092683126, "learning_rate": 1.0006630853273791e-06, "loss": 0.395, "step": 189 }, { "epoch": 3.0, "step": 189, "total_flos": 1.847238457911296e+16, "train_loss": 0.4201909417197818, "train_runtime": 1415.4735, "train_samples_per_second": 2.119, "train_steps_per_second": 0.134 } ], "logging_steps": 1, "max_steps": 189, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.847238457911296e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }