{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.970278303161308, "eval_steps": 500, "global_step": 36900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0810591731964334, "grad_norm": 1.3518340587615967, "learning_rate": 8.08108108108108e-05, "loss": 6.5209, "step": 300 }, { "epoch": 0.1621183463928668, "grad_norm": 1.4490776062011719, "learning_rate": 9.9375e-05, "loss": 4.8248, "step": 600 }, { "epoch": 0.2431775195893002, "grad_norm": 0.9239162802696228, "learning_rate": 9.855622270742358e-05, "loss": 4.6051, "step": 900 }, { "epoch": 0.3242366927857336, "grad_norm": 0.8636089563369751, "learning_rate": 9.773744541484717e-05, "loss": 4.5204, "step": 1200 }, { "epoch": 0.405295865982167, "grad_norm": 0.8884899020195007, "learning_rate": 9.691866812227075e-05, "loss": 4.4294, "step": 1500 }, { "epoch": 0.4863550391786004, "grad_norm": 0.9425375461578369, "learning_rate": 9.609989082969433e-05, "loss": 4.3436, "step": 1800 }, { "epoch": 0.5674142123750338, "grad_norm": 0.9191689491271973, "learning_rate": 9.528111353711791e-05, "loss": 4.3269, "step": 2100 }, { "epoch": 0.6484733855714672, "grad_norm": 0.8771488666534424, "learning_rate": 9.446233624454149e-05, "loss": 4.2743, "step": 2400 }, { "epoch": 0.7295325587679006, "grad_norm": 0.9781838059425354, "learning_rate": 9.364355895196507e-05, "loss": 4.1491, "step": 2700 }, { "epoch": 0.810591731964334, "grad_norm": 1.0138261318206787, "learning_rate": 9.282478165938865e-05, "loss": 4.047, "step": 3000 }, { "epoch": 0.8916509051607674, "grad_norm": 1.063523769378662, "learning_rate": 9.200600436681222e-05, "loss": 4.0248, "step": 3300 }, { "epoch": 0.9727100783572008, "grad_norm": 0.9346688389778137, "learning_rate": 9.118722707423582e-05, "loss": 3.9609, "step": 3600 }, { "epoch": 1.053769251553634, "grad_norm": 0.9149222373962402, "learning_rate": 9.036844978165939e-05, "loss": 3.9169, "step": 3900 }, { "epoch": 1.1348284247500675, "grad_norm": 1.1430439949035645, "learning_rate": 8.954967248908297e-05, "loss": 3.8597, "step": 4200 }, { "epoch": 1.215887597946501, "grad_norm": 0.9831075072288513, "learning_rate": 8.873089519650656e-05, "loss": 3.8311, "step": 4500 }, { "epoch": 1.2969467711429343, "grad_norm": 1.0454624891281128, "learning_rate": 8.791211790393013e-05, "loss": 3.7607, "step": 4800 }, { "epoch": 1.3780059443393677, "grad_norm": 0.9663439393043518, "learning_rate": 8.709334061135371e-05, "loss": 3.7721, "step": 5100 }, { "epoch": 1.4590651175358011, "grad_norm": 1.133966326713562, "learning_rate": 8.627456331877731e-05, "loss": 3.7393, "step": 5400 }, { "epoch": 1.5401242907322346, "grad_norm": 0.9640753865242004, "learning_rate": 8.545578602620087e-05, "loss": 3.7126, "step": 5700 }, { "epoch": 1.621183463928668, "grad_norm": 1.0320396423339844, "learning_rate": 8.463700873362446e-05, "loss": 3.6677, "step": 6000 }, { "epoch": 1.7022426371251014, "grad_norm": 0.991076648235321, "learning_rate": 8.381823144104804e-05, "loss": 3.6452, "step": 6300 }, { "epoch": 1.7833018103215346, "grad_norm": 1.1129027605056763, "learning_rate": 8.299945414847162e-05, "loss": 3.6468, "step": 6600 }, { "epoch": 1.864360983517968, "grad_norm": 1.1750997304916382, "learning_rate": 8.21806768558952e-05, "loss": 3.6052, "step": 6900 }, { "epoch": 1.9454201567144014, "grad_norm": 1.077567219734192, "learning_rate": 8.136189956331878e-05, "loss": 3.5974, "step": 7200 }, { "epoch": 2.026479329910835, "grad_norm": 1.1307553052902222, "learning_rate": 8.054312227074236e-05, "loss": 3.5258, "step": 7500 }, { "epoch": 2.107538503107268, "grad_norm": 0.9726724028587341, "learning_rate": 7.972434497816595e-05, "loss": 3.473, "step": 7800 }, { "epoch": 2.1885976763037016, "grad_norm": 1.080590844154358, "learning_rate": 7.890556768558953e-05, "loss": 3.4878, "step": 8100 }, { "epoch": 2.269656849500135, "grad_norm": 1.0957530736923218, "learning_rate": 7.808679039301311e-05, "loss": 3.473, "step": 8400 }, { "epoch": 2.3507160226965684, "grad_norm": 1.1584407091140747, "learning_rate": 7.726801310043669e-05, "loss": 3.4178, "step": 8700 }, { "epoch": 2.431775195893002, "grad_norm": 1.1033746004104614, "learning_rate": 7.644923580786027e-05, "loss": 3.4216, "step": 9000 }, { "epoch": 2.5128343690894352, "grad_norm": 1.035995364189148, "learning_rate": 7.563045851528384e-05, "loss": 3.4284, "step": 9300 }, { "epoch": 2.5938935422858687, "grad_norm": 1.1373919248580933, "learning_rate": 7.481168122270743e-05, "loss": 3.419, "step": 9600 }, { "epoch": 2.674952715482302, "grad_norm": 1.0167981386184692, "learning_rate": 7.3992903930131e-05, "loss": 3.3995, "step": 9900 }, { "epoch": 2.7560118886787355, "grad_norm": 1.1506949663162231, "learning_rate": 7.317412663755458e-05, "loss": 3.4456, "step": 10200 }, { "epoch": 2.837071061875169, "grad_norm": 1.281085729598999, "learning_rate": 7.235534934497818e-05, "loss": 3.3682, "step": 10500 }, { "epoch": 2.9181302350716023, "grad_norm": 1.1729835271835327, "learning_rate": 7.153657205240175e-05, "loss": 3.3507, "step": 10800 }, { "epoch": 2.9991894082680357, "grad_norm": 1.1416144371032715, "learning_rate": 7.071779475982533e-05, "loss": 3.3374, "step": 11100 }, { "epoch": 3.080248581464469, "grad_norm": 1.2445170879364014, "learning_rate": 6.989901746724891e-05, "loss": 3.2347, "step": 11400 }, { "epoch": 3.1613077546609025, "grad_norm": 1.1831103563308716, "learning_rate": 6.908024017467249e-05, "loss": 3.2341, "step": 11700 }, { "epoch": 3.242366927857336, "grad_norm": 1.1428508758544922, "learning_rate": 6.826146288209607e-05, "loss": 3.2382, "step": 12000 }, { "epoch": 3.3234261010537693, "grad_norm": 1.2155035734176636, "learning_rate": 6.744268558951965e-05, "loss": 3.2109, "step": 12300 }, { "epoch": 3.4044852742502028, "grad_norm": 1.1355677843093872, "learning_rate": 6.662390829694324e-05, "loss": 3.2283, "step": 12600 }, { "epoch": 3.485544447446636, "grad_norm": 1.2202376127243042, "learning_rate": 6.580513100436682e-05, "loss": 3.2252, "step": 12900 }, { "epoch": 3.5666036206430696, "grad_norm": 1.2496229410171509, "learning_rate": 6.49863537117904e-05, "loss": 3.2408, "step": 13200 }, { "epoch": 3.647662793839503, "grad_norm": 1.1879061460494995, "learning_rate": 6.416757641921398e-05, "loss": 3.1999, "step": 13500 }, { "epoch": 3.7287219670359364, "grad_norm": 1.2078286409378052, "learning_rate": 6.334879912663756e-05, "loss": 3.184, "step": 13800 }, { "epoch": 3.80978114023237, "grad_norm": 1.157354712486267, "learning_rate": 6.253002183406114e-05, "loss": 3.1967, "step": 14100 }, { "epoch": 3.8908403134288028, "grad_norm": 1.0799527168273926, "learning_rate": 6.171124454148471e-05, "loss": 3.1845, "step": 14400 }, { "epoch": 3.9718994866252366, "grad_norm": 1.162443995475769, "learning_rate": 6.0892467248908306e-05, "loss": 3.1956, "step": 14700 }, { "epoch": 4.05295865982167, "grad_norm": 1.362560749053955, "learning_rate": 6.007368995633188e-05, "loss": 3.0933, "step": 15000 }, { "epoch": 4.134017833018103, "grad_norm": 1.3539903163909912, "learning_rate": 5.9254912663755455e-05, "loss": 3.0665, "step": 15300 }, { "epoch": 4.215077006214536, "grad_norm": 1.2945058345794678, "learning_rate": 5.843613537117904e-05, "loss": 3.0822, "step": 15600 }, { "epoch": 4.29613617941097, "grad_norm": 1.2873772382736206, "learning_rate": 5.7617358078602625e-05, "loss": 3.032, "step": 15900 }, { "epoch": 4.377195352607403, "grad_norm": 1.3318586349487305, "learning_rate": 5.67985807860262e-05, "loss": 3.0344, "step": 16200 }, { "epoch": 4.458254525803837, "grad_norm": 1.3891690969467163, "learning_rate": 5.597980349344979e-05, "loss": 3.0648, "step": 16500 }, { "epoch": 4.53931369900027, "grad_norm": 1.3675804138183594, "learning_rate": 5.516102620087337e-05, "loss": 3.0373, "step": 16800 }, { "epoch": 4.620372872196704, "grad_norm": 1.3631969690322876, "learning_rate": 5.4342248908296944e-05, "loss": 3.0216, "step": 17100 }, { "epoch": 4.701432045393137, "grad_norm": 1.534043550491333, "learning_rate": 5.352347161572052e-05, "loss": 3.068, "step": 17400 }, { "epoch": 4.782491218589571, "grad_norm": 1.3604155778884888, "learning_rate": 5.2704694323144114e-05, "loss": 3.04, "step": 17700 }, { "epoch": 4.863550391786004, "grad_norm": 1.4917387962341309, "learning_rate": 5.188591703056769e-05, "loss": 3.0478, "step": 18000 }, { "epoch": 4.9446095649824375, "grad_norm": 1.4303983449935913, "learning_rate": 5.106713973799126e-05, "loss": 3.012, "step": 18300 }, { "epoch": 5.0256687381788705, "grad_norm": 1.5453460216522217, "learning_rate": 5.024836244541485e-05, "loss": 2.9692, "step": 18600 }, { "epoch": 5.106727911375304, "grad_norm": 1.4777169227600098, "learning_rate": 4.942958515283843e-05, "loss": 2.9011, "step": 18900 }, { "epoch": 5.187787084571737, "grad_norm": 1.5853521823883057, "learning_rate": 4.861080786026201e-05, "loss": 2.9105, "step": 19200 }, { "epoch": 5.268846257768171, "grad_norm": 1.4681776762008667, "learning_rate": 4.779203056768559e-05, "loss": 2.8882, "step": 19500 }, { "epoch": 5.349905430964604, "grad_norm": 1.3525965213775635, "learning_rate": 4.697325327510918e-05, "loss": 2.9123, "step": 19800 }, { "epoch": 5.430964604161037, "grad_norm": 1.4700337648391724, "learning_rate": 4.615447598253275e-05, "loss": 2.8784, "step": 20100 }, { "epoch": 5.512023777357471, "grad_norm": 1.5051637887954712, "learning_rate": 4.533569868995633e-05, "loss": 2.9082, "step": 20400 }, { "epoch": 5.593082950553905, "grad_norm": 1.4689549207687378, "learning_rate": 4.4516921397379915e-05, "loss": 2.9082, "step": 20700 }, { "epoch": 5.674142123750338, "grad_norm": 1.5370858907699585, "learning_rate": 4.3698144104803496e-05, "loss": 2.9034, "step": 21000 }, { "epoch": 5.755201296946771, "grad_norm": 1.5931947231292725, "learning_rate": 4.287936681222708e-05, "loss": 2.8862, "step": 21300 }, { "epoch": 5.836260470143205, "grad_norm": 1.4349807500839233, "learning_rate": 4.206058951965066e-05, "loss": 2.891, "step": 21600 }, { "epoch": 5.917319643339638, "grad_norm": 1.625874400138855, "learning_rate": 4.124181222707424e-05, "loss": 2.87, "step": 21900 }, { "epoch": 5.998378816536071, "grad_norm": 1.4641919136047363, "learning_rate": 4.0423034934497815e-05, "loss": 2.8981, "step": 22200 }, { "epoch": 6.079437989732504, "grad_norm": 1.3958481550216675, "learning_rate": 3.96042576419214e-05, "loss": 2.7289, "step": 22500 }, { "epoch": 6.160497162928938, "grad_norm": 1.586595058441162, "learning_rate": 3.8785480349344985e-05, "loss": 2.7348, "step": 22800 }, { "epoch": 6.241556336125371, "grad_norm": 1.7938852310180664, "learning_rate": 3.796670305676856e-05, "loss": 2.7383, "step": 23100 }, { "epoch": 6.322615509321805, "grad_norm": 1.7327919006347656, "learning_rate": 3.714792576419214e-05, "loss": 2.7897, "step": 23400 }, { "epoch": 6.403674682518238, "grad_norm": 1.5519119501113892, "learning_rate": 3.632914847161572e-05, "loss": 2.7947, "step": 23700 }, { "epoch": 6.484733855714672, "grad_norm": 1.7375255823135376, "learning_rate": 3.5510371179039304e-05, "loss": 2.79, "step": 24000 }, { "epoch": 6.565793028911105, "grad_norm": 1.8126753568649292, "learning_rate": 3.4691593886462886e-05, "loss": 2.7754, "step": 24300 }, { "epoch": 6.646852202107539, "grad_norm": 1.8342634439468384, "learning_rate": 3.387281659388647e-05, "loss": 2.7628, "step": 24600 }, { "epoch": 6.727911375303972, "grad_norm": 1.7573124170303345, "learning_rate": 3.305403930131005e-05, "loss": 2.7771, "step": 24900 }, { "epoch": 6.8089705485004055, "grad_norm": 1.7912206649780273, "learning_rate": 3.2235262008733623e-05, "loss": 2.7588, "step": 25200 }, { "epoch": 6.8900297216968385, "grad_norm": 1.5853058099746704, "learning_rate": 3.1416484716157205e-05, "loss": 2.7534, "step": 25500 }, { "epoch": 6.971088894893272, "grad_norm": 1.6281930208206177, "learning_rate": 3.0597707423580786e-05, "loss": 2.7513, "step": 25800 }, { "epoch": 7.052148068089705, "grad_norm": 1.8934043645858765, "learning_rate": 2.9778930131004368e-05, "loss": 2.7019, "step": 26100 }, { "epoch": 7.133207241286139, "grad_norm": 1.7691494226455688, "learning_rate": 2.896015283842795e-05, "loss": 2.6608, "step": 26400 }, { "epoch": 7.214266414482572, "grad_norm": 1.8065195083618164, "learning_rate": 2.8141375545851527e-05, "loss": 2.6537, "step": 26700 }, { "epoch": 7.295325587679006, "grad_norm": 1.902382254600525, "learning_rate": 2.7322598253275112e-05, "loss": 2.6414, "step": 27000 }, { "epoch": 7.376384760875439, "grad_norm": 1.7119675874710083, "learning_rate": 2.6503820960698687e-05, "loss": 2.6422, "step": 27300 }, { "epoch": 7.457443934071873, "grad_norm": 1.9427614212036133, "learning_rate": 2.5685043668122272e-05, "loss": 2.6449, "step": 27600 }, { "epoch": 7.538503107268306, "grad_norm": 1.9298570156097412, "learning_rate": 2.4866266375545853e-05, "loss": 2.642, "step": 27900 }, { "epoch": 7.61956228046474, "grad_norm": 1.762351393699646, "learning_rate": 2.404748908296943e-05, "loss": 2.6455, "step": 28200 }, { "epoch": 7.700621453661173, "grad_norm": 1.7794419527053833, "learning_rate": 2.3228711790393016e-05, "loss": 2.6509, "step": 28500 }, { "epoch": 7.781680626857606, "grad_norm": 1.777873158454895, "learning_rate": 2.2409934497816594e-05, "loss": 2.6572, "step": 28800 }, { "epoch": 7.862739800054039, "grad_norm": 1.815291166305542, "learning_rate": 2.1591157205240176e-05, "loss": 2.6432, "step": 29100 }, { "epoch": 7.943798973250473, "grad_norm": 1.8740344047546387, "learning_rate": 2.0772379912663757e-05, "loss": 2.663, "step": 29400 }, { "epoch": 8.024858146446906, "grad_norm": 1.8893027305603027, "learning_rate": 1.9953602620087335e-05, "loss": 2.5881, "step": 29700 }, { "epoch": 8.10591731964334, "grad_norm": 1.9792145490646362, "learning_rate": 1.913482532751092e-05, "loss": 2.5475, "step": 30000 }, { "epoch": 8.186976492839772, "grad_norm": 1.8607609272003174, "learning_rate": 1.83160480349345e-05, "loss": 2.5636, "step": 30300 }, { "epoch": 8.268035666036207, "grad_norm": 1.8906886577606201, "learning_rate": 1.749727074235808e-05, "loss": 2.5569, "step": 30600 }, { "epoch": 8.34909483923264, "grad_norm": 2.0039448738098145, "learning_rate": 1.667849344978166e-05, "loss": 2.553, "step": 30900 }, { "epoch": 8.430154012429073, "grad_norm": 1.9279450178146362, "learning_rate": 1.585971615720524e-05, "loss": 2.5327, "step": 31200 }, { "epoch": 8.511213185625508, "grad_norm": 2.061372756958008, "learning_rate": 1.5040938864628823e-05, "loss": 2.5668, "step": 31500 }, { "epoch": 8.59227235882194, "grad_norm": 2.1464438438415527, "learning_rate": 1.4222161572052402e-05, "loss": 2.5658, "step": 31800 }, { "epoch": 8.673331532018373, "grad_norm": 2.0610902309417725, "learning_rate": 1.3403384279475984e-05, "loss": 2.5592, "step": 32100 }, { "epoch": 8.754390705214806, "grad_norm": 2.092325448989868, "learning_rate": 1.2584606986899564e-05, "loss": 2.543, "step": 32400 }, { "epoch": 8.83544987841124, "grad_norm": 2.0462098121643066, "learning_rate": 1.1765829694323145e-05, "loss": 2.5544, "step": 32700 }, { "epoch": 8.916509051607674, "grad_norm": 2.0339772701263428, "learning_rate": 1.0947052401746725e-05, "loss": 2.5651, "step": 33000 }, { "epoch": 8.997568224804107, "grad_norm": 2.069972276687622, "learning_rate": 1.0128275109170306e-05, "loss": 2.5323, "step": 33300 }, { "epoch": 9.07862739800054, "grad_norm": 2.0698232650756836, "learning_rate": 9.309497816593888e-06, "loss": 2.4645, "step": 33600 }, { "epoch": 9.159686571196973, "grad_norm": 2.0408966541290283, "learning_rate": 8.490720524017468e-06, "loss": 2.4776, "step": 33900 }, { "epoch": 9.240745744393408, "grad_norm": 2.258899688720703, "learning_rate": 7.671943231441048e-06, "loss": 2.5065, "step": 34200 }, { "epoch": 9.32180491758984, "grad_norm": 1.9570540189743042, "learning_rate": 6.853165938864629e-06, "loss": 2.4528, "step": 34500 }, { "epoch": 9.402864090786274, "grad_norm": 2.107398509979248, "learning_rate": 6.03438864628821e-06, "loss": 2.4907, "step": 34800 }, { "epoch": 9.483923263982707, "grad_norm": 1.9422398805618286, "learning_rate": 5.21561135371179e-06, "loss": 2.4871, "step": 35100 }, { "epoch": 9.564982437179141, "grad_norm": 2.015700340270996, "learning_rate": 4.396834061135372e-06, "loss": 2.476, "step": 35400 }, { "epoch": 9.646041610375574, "grad_norm": 2.022306203842163, "learning_rate": 3.5780567685589524e-06, "loss": 2.4837, "step": 35700 }, { "epoch": 9.727100783572007, "grad_norm": 2.170642375946045, "learning_rate": 2.759279475982533e-06, "loss": 2.4854, "step": 36000 }, { "epoch": 9.80815995676844, "grad_norm": 2.0419552326202393, "learning_rate": 1.9405021834061136e-06, "loss": 2.4956, "step": 36300 }, { "epoch": 9.889219129964875, "grad_norm": 2.17526912689209, "learning_rate": 1.1217248908296945e-06, "loss": 2.5115, "step": 36600 }, { "epoch": 9.970278303161308, "grad_norm": 1.8749662637710571, "learning_rate": 3.0294759825327515e-07, "loss": 2.4465, "step": 36900 } ], "logging_steps": 300, "max_steps": 37010, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.7119298076672e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }