{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010101010101010102, "grad_norm": 9.4375, "learning_rate": 0.0, "loss": 2.1559, "step": 1 }, { "epoch": 0.020202020202020204, "grad_norm": 9.875, "learning_rate": 2.0000000000000003e-06, "loss": 2.3059, "step": 2 }, { "epoch": 0.030303030303030304, "grad_norm": 11.75, "learning_rate": 4.000000000000001e-06, "loss": 2.3407, "step": 3 }, { "epoch": 0.04040404040404041, "grad_norm": 10.375, "learning_rate": 6e-06, "loss": 2.3596, "step": 4 }, { "epoch": 0.050505050505050504, "grad_norm": 9.75, "learning_rate": 8.000000000000001e-06, "loss": 2.1863, "step": 5 }, { "epoch": 0.06060606060606061, "grad_norm": 8.25, "learning_rate": 1e-05, "loss": 2.1556, "step": 6 }, { "epoch": 0.0707070707070707, "grad_norm": 9.8125, "learning_rate": 9.893617021276596e-06, "loss": 2.1315, "step": 7 }, { "epoch": 0.08080808080808081, "grad_norm": 7.0, "learning_rate": 9.787234042553192e-06, "loss": 1.7968, "step": 8 }, { "epoch": 0.09090909090909091, "grad_norm": 4.1875, "learning_rate": 9.680851063829787e-06, "loss": 1.7795, "step": 9 }, { "epoch": 0.10101010101010101, "grad_norm": 2.953125, "learning_rate": 9.574468085106385e-06, "loss": 1.69, "step": 10 }, { "epoch": 0.10101010101010101, "eval_loss": 1.81484854221344, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1829, "eval_samples_per_second": 45.242, "eval_steps_per_second": 22.621, "step": 10 }, { "epoch": 0.1111111111111111, "grad_norm": 2.8125, "learning_rate": 9.46808510638298e-06, "loss": 1.6374, "step": 11 }, { "epoch": 0.12121212121212122, "grad_norm": 2.84375, "learning_rate": 9.361702127659576e-06, "loss": 1.8449, "step": 12 }, { "epoch": 0.13131313131313133, "grad_norm": 2.515625, "learning_rate": 9.255319148936171e-06, "loss": 1.6733, "step": 13 }, { "epoch": 0.1414141414141414, "grad_norm": 2.453125, "learning_rate": 9.148936170212767e-06, "loss": 1.5624, "step": 14 }, { "epoch": 0.15151515151515152, "grad_norm": 2.28125, "learning_rate": 9.042553191489362e-06, "loss": 1.4991, "step": 15 }, { "epoch": 0.16161616161616163, "grad_norm": 2.328125, "learning_rate": 8.936170212765958e-06, "loss": 1.7474, "step": 16 }, { "epoch": 0.1717171717171717, "grad_norm": 2.109375, "learning_rate": 8.829787234042555e-06, "loss": 1.6492, "step": 17 }, { "epoch": 0.18181818181818182, "grad_norm": 2.15625, "learning_rate": 8.72340425531915e-06, "loss": 1.59, "step": 18 }, { "epoch": 0.1919191919191919, "grad_norm": 1.828125, "learning_rate": 8.617021276595746e-06, "loss": 1.5473, "step": 19 }, { "epoch": 0.20202020202020202, "grad_norm": 2.078125, "learning_rate": 8.510638297872341e-06, "loss": 1.6566, "step": 20 }, { "epoch": 0.20202020202020202, "eval_loss": 1.6047254800796509, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1634, "eval_samples_per_second": 45.521, "eval_steps_per_second": 22.761, "step": 20 }, { "epoch": 0.21212121212121213, "grad_norm": 1.9296875, "learning_rate": 8.404255319148937e-06, "loss": 1.3761, "step": 21 }, { "epoch": 0.2222222222222222, "grad_norm": 1.7890625, "learning_rate": 8.297872340425532e-06, "loss": 1.5307, "step": 22 }, { "epoch": 0.23232323232323232, "grad_norm": 2.015625, "learning_rate": 8.191489361702128e-06, "loss": 1.6108, "step": 23 }, { "epoch": 0.24242424242424243, "grad_norm": 1.9296875, "learning_rate": 8.085106382978723e-06, "loss": 1.5662, "step": 24 }, { "epoch": 0.25252525252525254, "grad_norm": 1.9609375, "learning_rate": 7.97872340425532e-06, "loss": 1.6447, "step": 25 }, { "epoch": 0.26262626262626265, "grad_norm": 1.84375, "learning_rate": 7.872340425531916e-06, "loss": 1.5652, "step": 26 }, { "epoch": 0.2727272727272727, "grad_norm": 1.78125, "learning_rate": 7.765957446808511e-06, "loss": 1.5465, "step": 27 }, { "epoch": 0.2828282828282828, "grad_norm": 1.7265625, "learning_rate": 7.659574468085107e-06, "loss": 1.4737, "step": 28 }, { "epoch": 0.29292929292929293, "grad_norm": 1.765625, "learning_rate": 7.553191489361703e-06, "loss": 1.4285, "step": 29 }, { "epoch": 0.30303030303030304, "grad_norm": 1.7578125, "learning_rate": 7.446808510638298e-06, "loss": 1.5278, "step": 30 }, { "epoch": 0.30303030303030304, "eval_loss": 1.4933972358703613, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1671, "eval_samples_per_second": 45.467, "eval_steps_per_second": 22.733, "step": 30 }, { "epoch": 0.31313131313131315, "grad_norm": 1.6171875, "learning_rate": 7.340425531914894e-06, "loss": 1.317, "step": 31 }, { "epoch": 0.32323232323232326, "grad_norm": 1.796875, "learning_rate": 7.234042553191491e-06, "loss": 1.5716, "step": 32 }, { "epoch": 0.3333333333333333, "grad_norm": 1.640625, "learning_rate": 7.127659574468085e-06, "loss": 1.3389, "step": 33 }, { "epoch": 0.3434343434343434, "grad_norm": 1.6875, "learning_rate": 7.021276595744682e-06, "loss": 1.4052, "step": 34 }, { "epoch": 0.35353535353535354, "grad_norm": 1.703125, "learning_rate": 6.914893617021278e-06, "loss": 1.4422, "step": 35 }, { "epoch": 0.36363636363636365, "grad_norm": 1.6640625, "learning_rate": 6.808510638297873e-06, "loss": 1.3351, "step": 36 }, { "epoch": 0.37373737373737376, "grad_norm": 1.7265625, "learning_rate": 6.702127659574469e-06, "loss": 1.4089, "step": 37 }, { "epoch": 0.3838383838383838, "grad_norm": 1.734375, "learning_rate": 6.595744680851064e-06, "loss": 1.3695, "step": 38 }, { "epoch": 0.3939393939393939, "grad_norm": 1.7265625, "learning_rate": 6.48936170212766e-06, "loss": 1.4133, "step": 39 }, { "epoch": 0.40404040404040403, "grad_norm": 1.7109375, "learning_rate": 6.382978723404256e-06, "loss": 1.4619, "step": 40 }, { "epoch": 0.40404040404040403, "eval_loss": 1.4300950765609741, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1719, "eval_samples_per_second": 45.398, "eval_steps_per_second": 22.699, "step": 40 }, { "epoch": 0.41414141414141414, "grad_norm": 1.59375, "learning_rate": 6.276595744680851e-06, "loss": 1.3818, "step": 41 }, { "epoch": 0.42424242424242425, "grad_norm": 1.6875, "learning_rate": 6.170212765957447e-06, "loss": 1.3216, "step": 42 }, { "epoch": 0.43434343434343436, "grad_norm": 1.8671875, "learning_rate": 6.063829787234044e-06, "loss": 1.5044, "step": 43 }, { "epoch": 0.4444444444444444, "grad_norm": 1.7578125, "learning_rate": 5.957446808510638e-06, "loss": 1.4391, "step": 44 }, { "epoch": 0.45454545454545453, "grad_norm": 1.8828125, "learning_rate": 5.851063829787235e-06, "loss": 1.3796, "step": 45 }, { "epoch": 0.46464646464646464, "grad_norm": 1.984375, "learning_rate": 5.744680851063831e-06, "loss": 1.46, "step": 46 }, { "epoch": 0.47474747474747475, "grad_norm": 1.890625, "learning_rate": 5.638297872340426e-06, "loss": 1.3988, "step": 47 }, { "epoch": 0.48484848484848486, "grad_norm": 1.96875, "learning_rate": 5.531914893617022e-06, "loss": 1.3069, "step": 48 }, { "epoch": 0.494949494949495, "grad_norm": 1.890625, "learning_rate": 5.425531914893617e-06, "loss": 1.3012, "step": 49 }, { "epoch": 0.5050505050505051, "grad_norm": 1.8515625, "learning_rate": 5.319148936170213e-06, "loss": 1.3429, "step": 50 }, { "epoch": 0.5050505050505051, "eval_loss": 1.3854832649230957, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1645, "eval_samples_per_second": 45.504, "eval_steps_per_second": 22.752, "step": 50 }, { "epoch": 0.5151515151515151, "grad_norm": 1.890625, "learning_rate": 5.212765957446809e-06, "loss": 1.3141, "step": 51 }, { "epoch": 0.5252525252525253, "grad_norm": 1.78125, "learning_rate": 5.106382978723404e-06, "loss": 1.2483, "step": 52 }, { "epoch": 0.5353535353535354, "grad_norm": 1.8046875, "learning_rate": 5e-06, "loss": 1.3407, "step": 53 }, { "epoch": 0.5454545454545454, "grad_norm": 1.671875, "learning_rate": 4.893617021276596e-06, "loss": 1.2189, "step": 54 }, { "epoch": 0.5555555555555556, "grad_norm": 1.8359375, "learning_rate": 4.787234042553192e-06, "loss": 1.3621, "step": 55 }, { "epoch": 0.5656565656565656, "grad_norm": 1.765625, "learning_rate": 4.680851063829788e-06, "loss": 1.3592, "step": 56 }, { "epoch": 0.5757575757575758, "grad_norm": 1.8125, "learning_rate": 4.574468085106383e-06, "loss": 1.3573, "step": 57 }, { "epoch": 0.5858585858585859, "grad_norm": 1.7265625, "learning_rate": 4.468085106382979e-06, "loss": 1.3224, "step": 58 }, { "epoch": 0.5959595959595959, "grad_norm": 1.96875, "learning_rate": 4.361702127659575e-06, "loss": 1.3177, "step": 59 }, { "epoch": 0.6060606060606061, "grad_norm": 1.78125, "learning_rate": 4.255319148936171e-06, "loss": 1.3227, "step": 60 }, { "epoch": 0.6060606060606061, "eval_loss": 1.356658697128296, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1698, "eval_samples_per_second": 45.429, "eval_steps_per_second": 22.714, "step": 60 }, { "epoch": 0.6161616161616161, "grad_norm": 1.8203125, "learning_rate": 4.148936170212766e-06, "loss": 1.3576, "step": 61 }, { "epoch": 0.6262626262626263, "grad_norm": 1.7421875, "learning_rate": 4.042553191489362e-06, "loss": 1.2874, "step": 62 }, { "epoch": 0.6363636363636364, "grad_norm": 1.7890625, "learning_rate": 3.936170212765958e-06, "loss": 1.2097, "step": 63 }, { "epoch": 0.6464646464646465, "grad_norm": 1.765625, "learning_rate": 3.8297872340425535e-06, "loss": 1.2487, "step": 64 }, { "epoch": 0.6565656565656566, "grad_norm": 1.9765625, "learning_rate": 3.723404255319149e-06, "loss": 1.3143, "step": 65 }, { "epoch": 0.6666666666666666, "grad_norm": 1.9296875, "learning_rate": 3.6170212765957453e-06, "loss": 1.3207, "step": 66 }, { "epoch": 0.6767676767676768, "grad_norm": 1.796875, "learning_rate": 3.510638297872341e-06, "loss": 1.4014, "step": 67 }, { "epoch": 0.6868686868686869, "grad_norm": 1.8671875, "learning_rate": 3.4042553191489363e-06, "loss": 1.2789, "step": 68 }, { "epoch": 0.696969696969697, "grad_norm": 1.7734375, "learning_rate": 3.297872340425532e-06, "loss": 1.3333, "step": 69 }, { "epoch": 0.7070707070707071, "grad_norm": 2.015625, "learning_rate": 3.191489361702128e-06, "loss": 1.4142, "step": 70 }, { "epoch": 0.7070707070707071, "eval_loss": 1.336801528930664, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1642, "eval_samples_per_second": 45.51, "eval_steps_per_second": 22.755, "step": 70 }, { "epoch": 0.7171717171717171, "grad_norm": 1.859375, "learning_rate": 3.0851063829787237e-06, "loss": 1.3983, "step": 71 }, { "epoch": 0.7272727272727273, "grad_norm": 1.875, "learning_rate": 2.978723404255319e-06, "loss": 1.3606, "step": 72 }, { "epoch": 0.7373737373737373, "grad_norm": 1.9140625, "learning_rate": 2.8723404255319155e-06, "loss": 1.4158, "step": 73 }, { "epoch": 0.7474747474747475, "grad_norm": 2.015625, "learning_rate": 2.765957446808511e-06, "loss": 1.273, "step": 74 }, { "epoch": 0.7575757575757576, "grad_norm": 1.8203125, "learning_rate": 2.6595744680851065e-06, "loss": 1.2416, "step": 75 }, { "epoch": 0.7676767676767676, "grad_norm": 1.8203125, "learning_rate": 2.553191489361702e-06, "loss": 1.3183, "step": 76 }, { "epoch": 0.7777777777777778, "grad_norm": 1.75, "learning_rate": 2.446808510638298e-06, "loss": 1.2538, "step": 77 }, { "epoch": 0.7878787878787878, "grad_norm": 1.9375, "learning_rate": 2.340425531914894e-06, "loss": 1.2425, "step": 78 }, { "epoch": 0.797979797979798, "grad_norm": 1.7890625, "learning_rate": 2.2340425531914894e-06, "loss": 1.2664, "step": 79 }, { "epoch": 0.8080808080808081, "grad_norm": 1.8515625, "learning_rate": 2.1276595744680853e-06, "loss": 1.3056, "step": 80 }, { "epoch": 0.8080808080808081, "eval_loss": 1.3236327171325684, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1591, "eval_samples_per_second": 45.583, "eval_steps_per_second": 22.792, "step": 80 }, { "epoch": 0.8181818181818182, "grad_norm": 1.6875, "learning_rate": 2.021276595744681e-06, "loss": 1.2309, "step": 81 }, { "epoch": 0.8282828282828283, "grad_norm": 2.125, "learning_rate": 1.9148936170212767e-06, "loss": 1.3102, "step": 82 }, { "epoch": 0.8383838383838383, "grad_norm": 2.03125, "learning_rate": 1.8085106382978727e-06, "loss": 1.2998, "step": 83 }, { "epoch": 0.8484848484848485, "grad_norm": 1.90625, "learning_rate": 1.7021276595744682e-06, "loss": 1.3077, "step": 84 }, { "epoch": 0.8585858585858586, "grad_norm": 1.9140625, "learning_rate": 1.595744680851064e-06, "loss": 1.3, "step": 85 }, { "epoch": 0.8686868686868687, "grad_norm": 1.6953125, "learning_rate": 1.4893617021276596e-06, "loss": 1.2836, "step": 86 }, { "epoch": 0.8787878787878788, "grad_norm": 2.015625, "learning_rate": 1.3829787234042555e-06, "loss": 1.2684, "step": 87 }, { "epoch": 0.8888888888888888, "grad_norm": 2.0, "learning_rate": 1.276595744680851e-06, "loss": 1.2866, "step": 88 }, { "epoch": 0.898989898989899, "grad_norm": 1.875, "learning_rate": 1.170212765957447e-06, "loss": 1.4256, "step": 89 }, { "epoch": 0.9090909090909091, "grad_norm": 1.875, "learning_rate": 1.0638297872340427e-06, "loss": 1.2093, "step": 90 }, { "epoch": 0.9090909090909091, "eval_loss": 1.31633722782135, "eval_model_preparation_time": 0.0169, "eval_runtime": 3.1542, "eval_samples_per_second": 45.653, "eval_steps_per_second": 22.827, "step": 90 }, { "epoch": 0.9191919191919192, "grad_norm": 1.8359375, "learning_rate": 9.574468085106384e-07, "loss": 1.3478, "step": 91 }, { "epoch": 0.9292929292929293, "grad_norm": 2.1875, "learning_rate": 8.510638297872341e-07, "loss": 1.2843, "step": 92 }, { "epoch": 0.9393939393939394, "grad_norm": 1.8125, "learning_rate": 7.446808510638298e-07, "loss": 1.2619, "step": 93 }, { "epoch": 0.9494949494949495, "grad_norm": 1.6875, "learning_rate": 6.382978723404255e-07, "loss": 1.3074, "step": 94 }, { "epoch": 0.9595959595959596, "grad_norm": 1.921875, "learning_rate": 5.319148936170213e-07, "loss": 1.3186, "step": 95 }, { "epoch": 0.9696969696969697, "grad_norm": 2.03125, "learning_rate": 4.2553191489361704e-07, "loss": 1.3499, "step": 96 }, { "epoch": 0.9797979797979798, "grad_norm": 2.0, "learning_rate": 3.1914893617021275e-07, "loss": 1.2827, "step": 97 }, { "epoch": 0.98989898989899, "grad_norm": 1.859375, "learning_rate": 2.1276595744680852e-07, "loss": 1.1419, "step": 98 }, { "epoch": 1.0, "grad_norm": 1.8828125, "learning_rate": 1.0638297872340426e-07, "loss": 1.2356, "step": 99 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5524656076046336e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }