{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 214.0625, "epoch": 0.5714285714285714, "grad_norm": 129.29193115234375, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0, "reward": 3.2055165767669678, "reward_std": 0.3085313138435595, "rewards/concensus_correctness_reward_func": 0.7296249940991402, "rewards/consensus_reward_func": 0.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6392978364601731, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.21875, "rewards/xmlcount_reward_func": 0.7897187536582351, "step": 2 }, { "completion_length": 130.45833333333334, "epoch": 1.0, "grad_norm": 23.675512313842773, "kl": 0.07788466417696327, "learning_rate": 4.864543104251586e-07, "loss": 0.0001, "reward": 5.991452674070994, "reward_std": 0.33198659618695575, "rewards/concensus_correctness_reward_func": 1.8323333412408829, "rewards/consensus_reward_func": 1.4166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.16666666666666666, "rewards/question_recreation_reward_func": 0.9455776462952296, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1927083333333333, "step": 4 }, { "completion_length": 163.71875, "epoch": 1.5714285714285714, "grad_norm": 31.248865127563477, "kl": 0.15762662113411352, "learning_rate": 4.472851273490984e-07, "loss": 0.0002, "reward": 5.097857855260372, "reward_std": 1.0911973172042053, "rewards/concensus_correctness_reward_func": 1.5450625084340572, "rewards/consensus_reward_func": 1.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.8670765832066536, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.0763437524437904, "step": 6 }, { "completion_length": 153.45833333333334, "epoch": 2.0, "grad_norm": 59.27997970581055, "kl": 5.424358828614156, "learning_rate": 3.867370395306068e-07, "loss": 0.0041, "reward": 4.880958507458369, "reward_std": 0.7015785239636898, "rewards/concensus_correctness_reward_func": 1.1009166582177083, "rewards/consensus_reward_func": 1.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9571251372496287, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666666666667, "rewards/xmlcount_reward_func": 1.1354166666666667, "step": 8 }, { "completion_length": 159.84375, "epoch": 2.571428571428571, "grad_norm": 22.492767333984375, "kl": 3.40459524304606, "learning_rate": 3.1137137178519977e-07, "loss": 0.0034, "reward": 5.54975700378418, "reward_std": 0.4386998292757198, "rewards/concensus_correctness_reward_func": 1.5714374966919422, "rewards/consensus_reward_func": 1.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9585695303976536, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 1.1916250064969063, "step": 10 }, { "completion_length": 128.125, "epoch": 3.0, "grad_norm": 13.225279808044434, "kl": 0.43122306823109585, "learning_rate": 2.2935516363191693e-07, "loss": 0.0003, "reward": 6.302078684171041, "reward_std": 0.05914810299873352, "rewards/concensus_correctness_reward_func": 1.7683333444098632, "rewards/consensus_reward_func": 1.8333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9768286347389221, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4791666666666667, "rewards/xmlcount_reward_func": 1.2444166739781697, "step": 12 }, { "completion_length": 138.78125, "epoch": 3.571428571428571, "grad_norm": 34.572078704833984, "kl": 2.4331941911950707, "learning_rate": 1.4957614383675767e-07, "loss": 0.0024, "reward": 5.1307472586631775, "reward_std": 1.0153084722987842, "rewards/concensus_correctness_reward_func": 1.2935625012032688, "rewards/consensus_reward_func": 1.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.8988097123801708, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.2040000036358833, "step": 14 }, { "completion_length": 147.54166666666666, "epoch": 4.0, "grad_norm": 0.5938521027565002, "kl": 45.33216493514677, "learning_rate": 8.067960709356478e-08, "loss": 0.034, "reward": 5.661058286825816, "reward_std": 0.6594598864515623, "rewards/concensus_correctness_reward_func": 1.617166668176651, "rewards/consensus_reward_func": 1.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.16666666666666666, "rewards/question_recreation_reward_func": 0.9611416161060333, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333333333333, "rewards/xmlcount_reward_func": 1.186916669209798, "step": 16 }, { "completion_length": 137.96875, "epoch": 4.571428571428571, "grad_norm": 129.46636962890625, "kl": 14.352025136118755, "learning_rate": 3.013156219837776e-08, "loss": 0.0144, "reward": 5.41445254907012, "reward_std": 0.7548820912343217, "rewards/concensus_correctness_reward_func": 1.496062494814396, "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.8958275727927685, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 1.1319375038146973, "step": 18 }, { "completion_length": 150.33333333333334, "epoch": 5.0, "grad_norm": 23.762697219848633, "kl": 328.1498412267926, "learning_rate": 3.4096741493194193e-09, "loss": 0.2461, "reward": 5.554007162650426, "reward_std": 0.5768525426586469, "rewards/concensus_correctness_reward_func": 1.6576666782299678, "rewards/consensus_reward_func": 1.1666666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.16666666666666666, "rewards/question_recreation_reward_func": 0.9640488177537918, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3958333333333333, "rewards/xmlcount_reward_func": 1.203125, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 0.030490872963127913, "train_runtime": 111.1256, "train_samples_per_second": 2.88, "train_steps_per_second": 0.18 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }