{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 228.375, "epoch": 0.1, "grad_norm": 10.439698219299316, "kl": 0.0, "learning_rate": 5e-07, "loss": -0.0, "reward": 3.9033486885018647, "reward_std": 0.45634667729609646, "rewards/concensus_correctness_reward_func": 1.1147499978542328, "rewards/consensus_reward_func": 1.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.674067463260144, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.6770312478765845, "step": 2 }, { "completion_length": 157.25, "epoch": 0.2, "grad_norm": 9.67512035369873, "kl": 0.27191234956262633, "learning_rate": 4.864543104251586e-07, "loss": 0.0003, "reward": 6.323291674256325, "reward_std": 0.29528094810666516, "rewards/concensus_correctness_reward_func": 1.802937489002943, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.8601979417726398, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 1.22265625, "step": 4 }, { "completion_length": 183.59375, "epoch": 0.3, "grad_norm": 11.30721664428711, "kl": 0.7817973756464198, "learning_rate": 4.472851273490984e-07, "loss": 0.0008, "reward": 6.385719895362854, "reward_std": 0.3340341460861964, "rewards/concensus_correctness_reward_func": 1.934749998152256, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8103449335321784, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 1.171875, "step": 6 }, { "completion_length": 150.96875, "epoch": 0.4, "grad_norm": 33.876792907714844, "kl": 2.9556224551051855, "learning_rate": 3.867370395306068e-07, "loss": 0.003, "reward": 6.251800790429115, "reward_std": 0.2621385550010018, "rewards/concensus_correctness_reward_func": 1.9013749873265624, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.8647695071995258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 1.173156250268221, "step": 8 }, { "completion_length": 132.75, "epoch": 0.5, "grad_norm": 22.34031105041504, "kl": 1.533589490922168, "learning_rate": 3.1137137178519977e-07, "loss": 0.0015, "reward": 6.721142441034317, "reward_std": 0.37645364040508866, "rewards/concensus_correctness_reward_func": 2.0946249775588512, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.886954908259213, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 1.177062500268221, "step": 10 }, { "completion_length": 147.625, "epoch": 0.6, "grad_norm": 43.14989471435547, "kl": 3.1210397497052327, "learning_rate": 2.2935516363191693e-07, "loss": 0.0031, "reward": 6.697916433215141, "reward_std": 0.357076933105418, "rewards/concensus_correctness_reward_func": 1.9758749939501286, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8353225886821747, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.23046875, "step": 12 }, { "completion_length": 144.875, "epoch": 0.7, "grad_norm": 14.534745216369629, "kl": 32.27562204410788, "learning_rate": 1.4957614383675767e-07, "loss": 0.0323, "reward": 6.202968865633011, "reward_std": 0.3484449392417446, "rewards/concensus_correctness_reward_func": 1.815625011920929, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.8974376134574413, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.390625, "rewards/xmlcount_reward_func": 1.1617812514305115, "step": 14 }, { "completion_length": 157.625, "epoch": 0.8, "grad_norm": 14.83914566040039, "kl": 4.667069488205016, "learning_rate": 8.067960709356478e-08, "loss": 0.0047, "reward": 6.212416380643845, "reward_std": 0.5250872050528415, "rewards/concensus_correctness_reward_func": 1.806124985218048, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9003850966691971, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 1.0996562466025352, "step": 16 }, { "completion_length": 139.1875, "epoch": 0.9, "grad_norm": 17.672067642211914, "kl": 5.667459648102522, "learning_rate": 3.013156219837776e-08, "loss": 0.0057, "reward": 6.300769180059433, "reward_std": 0.4447646054904908, "rewards/concensus_correctness_reward_func": 1.83356249704957, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8667067214846611, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.178625002503395, "step": 18 }, { "completion_length": 138.3125, "epoch": 1.0, "grad_norm": 18.937057495117188, "kl": 7.192570003448054, "learning_rate": 3.4096741493194193e-09, "loss": 0.0072, "reward": 6.140490561723709, "reward_std": 0.5464329523674678, "rewards/concensus_correctness_reward_func": 1.854937493801117, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.8636780511587858, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.1875, "step": 20 }, { "epoch": 1.0, "step": 20, "total_flos": 0.0, "train_loss": 0.0058470307034440335, "train_runtime": 126.5029, "train_samples_per_second": 2.53, "train_steps_per_second": 0.158 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }