{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.235294117647059, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 5.15625, "epoch": 0.23529411764705882, "grad_norm": 39.0929069519043, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0, "reward": 3.248152125161141, "reward_std": 0.17586159135680646, "rewards/concensus_correctness_reward_func": 1.5171249937266111, "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04352713652770035, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 2 }, { "completion_length": 5.65625, "epoch": 0.47058823529411764, "grad_norm": 9.405702590942383, "kl": 0.0008332367069669999, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 2.6308175306767225, "reward_std": 0.09294964651417104, "rewards/concensus_correctness_reward_func": 1.0111249908804893, "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05719255609437823, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 4 }, { "completion_length": 5.25, "epoch": 0.7058823529411765, "grad_norm": 0.005004729609936476, "kl": 0.00138925834714132, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 3.29681864293525, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.64512500166893, "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.026693683670600876, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 6 }, { "completion_length": 10.96875, "epoch": 0.9411764705882353, "grad_norm": 6.889959812164307, "kl": 0.009962221372309621, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 2.44820939283818, "reward_std": 0.09646153113862965, "rewards/concensus_correctness_reward_func": 1.089749999344349, "rewards/consensus_reward_func": 1.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04205314003047533, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00390625, "step": 8 }, { "completion_length": 13.083333333333334, "epoch": 1.1176470588235294, "grad_norm": 5.5648772104177624e-05, "kl": 0.00030983873875811696, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 2.299651576206088, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 0.5804999967416128, "rewards/consensus_reward_func": 1.6666666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05248492040360967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 10 }, { "completion_length": 5.375, "epoch": 1.3529411764705883, "grad_norm": 0.0002206077624578029, "kl": 0.0009581862177583389, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 3.1473974231630564, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.3519999906420708, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04539743281202391, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 12 }, { "completion_length": 6.25, "epoch": 1.5882352941176472, "grad_norm": 0.01640104316174984, "kl": 0.00733395929758629, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 3.1192562850774266, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.5808749962598085, "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03838133270619437, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 14 }, { "completion_length": 3.6875, "epoch": 1.8235294117647058, "grad_norm": 0.001582066877745092, "kl": 0.002636152086779475, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 3.485957680270076, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.571250006556511, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03189519251463935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0078125, "step": 16 }, { "completion_length": 4.375, "epoch": 2.0, "grad_norm": 8.508745281687879e-07, "kl": 0.06106317855624847, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.729074776172638, "reward_std": 0.009295503919323286, "rewards/concensus_correctness_reward_func": 0.6874999900658926, "rewards/consensus_reward_func": 1.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04157479809752355, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 18 }, { "completion_length": 6.0, "epoch": 2.235294117647059, "grad_norm": 0.0014920292887836695, "kl": 0.005889443103418657, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 2.7330365292727947, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.0577499866485596, "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04247402522014454, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0078125, "step": 20 }, { "epoch": 2.235294117647059, "step": 20, "total_flos": 0.0, "train_loss": 7.542508318181263e-06, "train_runtime": 1333.8259, "train_samples_per_second": 0.24, "train_steps_per_second": 0.015 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }