{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 2.691509962081909, "learning_rate": 0.00015998457923856519, "loss": 1.4456, "step": 1 }, { "epoch": 0.025, "grad_norm": 24.26286506652832, "learning_rate": 0.00015993832289925785, "loss": 4.3349, "step": 2 }, { "epoch": 0.0375, "grad_norm": 13.163036346435547, "learning_rate": 0.0001598612488147773, "loss": 2.6695, "step": 3 }, { "epoch": 0.05, "grad_norm": 9.818785667419434, "learning_rate": 0.00015975338669865026, "loss": 2.3799, "step": 4 }, { "epoch": 0.0625, "grad_norm": 6.200242519378662, "learning_rate": 0.00015961477813377576, "loss": 2.0935, "step": 5 }, { "epoch": 0.075, "grad_norm": 2.0556814670562744, "learning_rate": 0.00015944547655639412, "loss": 1.8465, "step": 6 }, { "epoch": 0.0875, "grad_norm": 2.5195746421813965, "learning_rate": 0.00015924554723548617, "loss": 1.7321, "step": 7 }, { "epoch": 0.1, "grad_norm": 4.300451278686523, "learning_rate": 0.00015901506724761103, "loss": 1.7284, "step": 8 }, { "epoch": 0.1125, "grad_norm": 1.5021892786026, "learning_rate": 0.00015875412544719134, "loss": 1.5971, "step": 9 }, { "epoch": 0.125, "grad_norm": 1.5246820449829102, "learning_rate": 0.00015846282243225845, "loss": 1.562, "step": 10 }, { "epoch": 0.1375, "grad_norm": 2.0095787048339844, "learning_rate": 0.0001581412705056698, "loss": 1.578, "step": 11 }, { "epoch": 0.15, "grad_norm": 0.9773982763290405, "learning_rate": 0.00015778959363181415, "loss": 1.4977, "step": 12 }, { "epoch": 0.1625, "grad_norm": 1.1493251323699951, "learning_rate": 0.0001574079273888208, "loss": 1.5075, "step": 13 }, { "epoch": 0.175, "grad_norm": 0.8909309506416321, "learning_rate": 0.00015699641891629178, "loss": 1.4158, "step": 14 }, { "epoch": 0.1875, "grad_norm": 0.9415439963340759, "learning_rate": 0.00015655522685857672, "loss": 1.4219, "step": 15 }, { "epoch": 0.2, "grad_norm": 1.1703603267669678, "learning_rate": 0.0001560845213036123, "loss": 1.4006, "step": 16 }, { "epoch": 0.2125, "grad_norm": 0.7575011849403381, "learning_rate": 0.00015558448371735025, "loss": 1.3675, "step": 17 }, { "epoch": 0.225, "grad_norm": 0.6772542595863342, "learning_rate": 0.00015505530687379875, "loss": 1.3369, "step": 18 }, { "epoch": 0.2375, "grad_norm": 0.5587411522865295, "learning_rate": 0.00015449719478070428, "loss": 1.3632, "step": 19 }, { "epoch": 0.25, "grad_norm": 0.5920618772506714, "learning_rate": 0.00015391036260090294, "loss": 1.3511, "step": 20 }, { "epoch": 0.2625, "grad_norm": 0.4218953847885132, "learning_rate": 0.0001532950365693709, "loss": 1.3641, "step": 21 }, { "epoch": 0.275, "grad_norm": 0.4676741361618042, "learning_rate": 0.00015265145390600652, "loss": 1.3441, "step": 22 }, { "epoch": 0.2875, "grad_norm": 0.38095250725746155, "learning_rate": 0.00015197986272417774, "loss": 1.3418, "step": 23 }, { "epoch": 0.3, "grad_norm": 0.42308753728866577, "learning_rate": 0.00015128052193506944, "loss": 1.3646, "step": 24 }, { "epoch": 0.3125, "grad_norm": 0.4307089149951935, "learning_rate": 0.0001505537011478684, "loss": 1.2992, "step": 25 }, { "epoch": 0.325, "grad_norm": 0.33103814721107483, "learning_rate": 0.0001497996805658238, "loss": 1.3435, "step": 26 }, { "epoch": 0.3375, "grad_norm": 0.3511773645877838, "learning_rate": 0.00014901875087822337, "loss": 1.3, "step": 27 }, { "epoch": 0.35, "grad_norm": 0.2914850115776062, "learning_rate": 0.0001482112131483274, "loss": 1.3103, "step": 28 }, { "epoch": 0.3625, "grad_norm": 0.37050625681877136, "learning_rate": 0.00014737737869730292, "loss": 1.2731, "step": 29 }, { "epoch": 0.375, "grad_norm": 0.3476356565952301, "learning_rate": 0.00014651756898420365, "loss": 1.3211, "step": 30 }, { "epoch": 0.3875, "grad_norm": 0.27799472212791443, "learning_rate": 0.0001456321154820411, "loss": 1.2657, "step": 31 }, { "epoch": 0.4, "grad_norm": 0.318327397108078, "learning_rate": 0.00014472135954999581, "loss": 1.3068, "step": 32 }, { "epoch": 0.4125, "grad_norm": 0.30465707182884216, "learning_rate": 0.00014378565230181657, "loss": 1.2839, "step": 33 }, { "epoch": 0.425, "grad_norm": 0.2618834376335144, "learning_rate": 0.0001428253544704596, "loss": 1.2868, "step": 34 }, { "epoch": 0.4375, "grad_norm": 0.2864656150341034, "learning_rate": 0.00014184083626901897, "loss": 1.2815, "step": 35 }, { "epoch": 0.45, "grad_norm": 0.2776831388473511, "learning_rate": 0.0001408324772480025, "loss": 1.2895, "step": 36 }, { "epoch": 0.4625, "grad_norm": 0.31238630414009094, "learning_rate": 0.00013980066614900776, "loss": 1.2718, "step": 37 }, { "epoch": 0.475, "grad_norm": 0.23365426063537598, "learning_rate": 0.00013874580075485485, "loss": 1.2596, "step": 38 }, { "epoch": 0.4875, "grad_norm": 0.23924365639686584, "learning_rate": 0.00013766828773623352, "loss": 1.2809, "step": 39 }, { "epoch": 0.5, "grad_norm": 0.24298632144927979, "learning_rate": 0.00013656854249492382, "loss": 1.2248, "step": 40 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3190320448392397e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }