{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4705882352941178, "eval_steps": 500, "global_step": 25, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 3.6089425086975098, "epoch": 0.058823529411764705, "grad_norm": NaN, "learning_rate": 0.0, "loss": 6.292151927947998, "mean_token_accuracy": 0.0, "num_tokens": 15.0, "step": 1 }, { "entropy": 5.715427875518799, "epoch": 0.11764705882352941, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 6.3516693115234375, "mean_token_accuracy": 0.0, "num_tokens": 22.0, "step": 2 }, { "entropy": 6.061905860900879, "epoch": 0.17647058823529413, "grad_norm": NaN, "learning_rate": 0.0, "loss": 4.695749282836914, "mean_token_accuracy": 0.3333333432674408, "num_tokens": 29.0, "step": 3 }, { "entropy": 5.138950824737549, "epoch": 0.23529411764705882, "grad_norm": 34.225440979003906, "learning_rate": 0.0, "loss": 4.924757957458496, "mean_token_accuracy": 0.375, "num_tokens": 41.0, "step": 4 }, { "entropy": 5.107641220092773, "epoch": 0.29411764705882354, "grad_norm": Infinity, "learning_rate": 4.000000000000001e-06, "loss": 4.769550800323486, "mean_token_accuracy": 0.0, "num_tokens": 47.0, "step": 5 }, { "entropy": 4.351223945617676, "epoch": 0.35294117647058826, "grad_norm": 26.4381046295166, "learning_rate": 4.000000000000001e-06, "loss": 2.740773916244507, "mean_token_accuracy": 0.4285714328289032, "num_tokens": 62.0, "step": 6 }, { "entropy": 4.605663299560547, "epoch": 0.4117647058823529, "grad_norm": 44.312320709228516, "learning_rate": 8.000000000000001e-06, "loss": 4.73244571685791, "mean_token_accuracy": 0.25, "num_tokens": 70.0, "step": 7 }, { "entropy": 4.652010440826416, "epoch": 0.47058823529411764, "grad_norm": 29.16073989868164, "learning_rate": 1.2e-05, "loss": 3.551349639892578, "mean_token_accuracy": 0.3333333432674408, "num_tokens": 83.0, "step": 8 }, { "entropy": 3.92993426322937, "epoch": 0.5294117647058824, "grad_norm": 24.797420501708984, "learning_rate": 1.6000000000000003e-05, "loss": 3.086703300476074, "mean_token_accuracy": 0.5454545617103577, "num_tokens": 101.0, "step": 9 }, { "entropy": 4.687472343444824, "epoch": 0.5882352941176471, "grad_norm": 65.40557098388672, "learning_rate": 2e-05, "loss": 6.584832668304443, "mean_token_accuracy": 0.0, "num_tokens": 106.0, "step": 10 }, { "entropy": 5.460619926452637, "epoch": 0.6470588235294118, "grad_norm": 62.57985305786133, "learning_rate": 1.9975640502598243e-05, "loss": 5.282391548156738, "mean_token_accuracy": 0.3333333432674408, "num_tokens": 111.0, "step": 11 }, { "entropy": 5.458609580993652, "epoch": 0.7058823529411765, "grad_norm": 47.68376922607422, "learning_rate": 1.9902680687415704e-05, "loss": 3.4104318618774414, "mean_token_accuracy": 0.5, "num_tokens": 118.0, "step": 12 }, { "entropy": 5.297245979309082, "epoch": 0.7647058823529411, "grad_norm": 27.625783920288086, "learning_rate": 1.9781476007338058e-05, "loss": 2.5839743614196777, "mean_token_accuracy": 0.6000000238418579, "num_tokens": 126.0, "step": 13 }, { "entropy": 4.2264180183410645, "epoch": 0.8235294117647058, "grad_norm": 21.131492614746094, "learning_rate": 1.961261695938319e-05, "loss": 2.5414154529571533, "mean_token_accuracy": 0.5714285969734192, "num_tokens": 138.0, "step": 14 }, { "entropy": 4.7774658203125, "epoch": 0.8823529411764706, "grad_norm": 40.83763885498047, "learning_rate": 1.9396926207859085e-05, "loss": 2.8697211742401123, "mean_token_accuracy": 0.4000000059604645, "num_tokens": 145.0, "step": 15 }, { "entropy": 5.260831832885742, "epoch": 0.9411764705882353, "grad_norm": 40.107913970947266, "learning_rate": 1.913545457642601e-05, "loss": 2.11948561668396, "mean_token_accuracy": 0.3333333432674408, "num_tokens": 153.0, "step": 16 }, { "entropy": 5.279318332672119, "epoch": 1.0, "grad_norm": 34.499481201171875, "learning_rate": 1.8829475928589272e-05, "loss": 2.028179883956909, "mean_token_accuracy": 0.5, "num_tokens": 161.0, "step": 17 }, { "entropy": 3.6013762950897217, "epoch": 1.0588235294117647, "grad_norm": 45.223697662353516, "learning_rate": 1.848048096156426e-05, "loss": 4.955894470214844, "mean_token_accuracy": 0.3333333432674408, "num_tokens": 176.0, "step": 18 }, { "entropy": 4.529469966888428, "epoch": 1.1176470588235294, "grad_norm": 30.39960479736328, "learning_rate": 1.8090169943749477e-05, "loss": 1.4438493251800537, "mean_token_accuracy": 0.6000000238418579, "num_tokens": 184.0, "step": 19 }, { "entropy": 3.973909378051758, "epoch": 1.1764705882352942, "grad_norm": 18.265600204467773, "learning_rate": 1.766044443118978e-05, "loss": 1.7491637468338013, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 196.0, "step": 20 }, { "entropy": 4.81941556930542, "epoch": 1.2352941176470589, "grad_norm": 37.519798278808594, "learning_rate": 1.7193398003386514e-05, "loss": 3.381870985031128, "mean_token_accuracy": 0.3333333432674408, "num_tokens": 202.0, "step": 21 }, { "entropy": 4.61383056640625, "epoch": 1.2941176470588236, "grad_norm": 24.117401123046875, "learning_rate": 1.6691306063588583e-05, "loss": 0.9651630520820618, "mean_token_accuracy": 0.75, "num_tokens": 210.0, "step": 22 }, { "entropy": 4.4701128005981445, "epoch": 1.3529411764705883, "grad_norm": 28.767288208007812, "learning_rate": 1.6156614753256583e-05, "loss": 0.8833061456680298, "mean_token_accuracy": 1.0, "num_tokens": 217.0, "step": 23 }, { "entropy": 4.853976726531982, "epoch": 1.4117647058823528, "grad_norm": 41.10981369018555, "learning_rate": 1.5591929034707468e-05, "loss": 3.586022138595581, "mean_token_accuracy": 0.3333333432674408, "num_tokens": 222.0, "step": 24 }, { "entropy": 4.533289909362793, "epoch": 1.4705882352941178, "grad_norm": Infinity, "learning_rate": 1.5000000000000002e-05, "loss": 2.87251877784729, "mean_token_accuracy": 0.5, "num_tokens": 229.0, "step": 25 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 145923548544.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }