{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 19.571428571428573, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 250.98684210526315, "epoch": 1.5714285714285714, "grad_norm": 35.16233825683594, "kl": 27.098115991604956, "learning_rate": 4.965903258506806e-07, "loss": 0.0322, "reward": 2.5767549056755867, "reward_std": 1.3109975865012722, "rewards/concensus_correctness_reward_func": 1.0325000160618831, "rewards/consensus_reward_func": 0.6578947368421053, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.02631578947368421, "rewards/question_recreation_reward_func": 0.25305746799628986, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05263157894736842, "rewards/xmlcount_reward_func": 0.5543552668471086, "step": 2 }, { "completion_length": 139.48863636363637, "epoch": 3.571428571428571, "grad_norm": 44.14933395385742, "kl": 7.791636250235817, "learning_rate": 4.698684378016222e-07, "loss": 0.0107, "reward": 2.973739290779287, "reward_std": 1.2925651385025545, "rewards/concensus_correctness_reward_func": 0.8166136467321352, "rewards/consensus_reward_func": 0.8863636363636364, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.022727272727272728, "rewards/question_recreation_reward_func": 0.2649438115344806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.07386363636363637, "rewards/xmlcount_reward_func": 0.9092272858728062, "step": 4 }, { "completion_length": 133.11363636363637, "epoch": 5.571428571428571, "grad_norm": 48.02810287475586, "kl": 11.512206840244206, "learning_rate": 4.193203929064353e-07, "loss": 0.0158, "reward": 2.8814289461482656, "reward_std": 1.2390722137960521, "rewards/concensus_correctness_reward_func": 0.748363643545996, "rewards/consensus_reward_func": 0.7272727272727273, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.06818181818181818, "rewards/question_recreation_reward_func": 0.29970166425813327, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.9754091013561595, "step": 6 }, { "completion_length": 141.04545454545453, "epoch": 7.571428571428571, "grad_norm": 38.969947814941406, "kl": 12.051131608811291, "learning_rate": 3.5042385616324236e-07, "loss": 0.0166, "reward": 3.381654983217066, "reward_std": 1.4018883820284496, "rewards/concensus_correctness_reward_func": 0.8803863657469099, "rewards/consensus_reward_func": 1.1136363636363635, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.06818181818181818, "rewards/question_recreation_reward_func": 0.26520041410218587, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09090909090909091, "rewards/xmlcount_reward_func": 0.9633409245447679, "step": 8 }, { "completion_length": 149.89772727272728, "epoch": 9.571428571428571, "grad_norm": 24.2377872467041, "kl": 14.045930672775615, "learning_rate": 2.706448363680831e-07, "loss": 0.0193, "reward": 3.2756326740438286, "reward_std": 1.2956370193172584, "rewards/concensus_correctness_reward_func": 0.9670681904324077, "rewards/consensus_reward_func": 0.9545454545454546, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1590909090909091, "rewards/question_recreation_reward_func": 0.2421099069443616, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.8903181905096228, "step": 10 }, { "completion_length": 141.95454545454547, "epoch": 11.571428571428571, "grad_norm": 29.485441207885742, "kl": 32.70376639745452, "learning_rate": 1.886286282148002e-07, "loss": 0.045, "reward": 3.226967532526363, "reward_std": 1.3467150957069614, "rewards/concensus_correctness_reward_func": 0.9230227328159593, "rewards/consensus_reward_func": 1.0227272727272727, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.06818181818181818, "rewards/question_recreation_reward_func": 0.3108766495504163, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.056818181818181816, "rewards/xmlcount_reward_func": 0.8453409211202101, "step": 12 }, { "completion_length": 160.9659090909091, "epoch": 13.571428571428571, "grad_norm": 248509.75, "kl": 10224.36121920022, "learning_rate": 1.1326296046939333e-07, "loss": 14.0585, "reward": 3.0833815173669294, "reward_std": 1.6619971624829553, "rewards/concensus_correctness_reward_func": 0.9169318371198394, "rewards/consensus_reward_func": 0.8863636363636364, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.045454545454545456, "rewards/question_recreation_reward_func": 0.2725861096246676, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05113636363636364, "rewards/xmlcount_reward_func": 0.9109090945937417, "step": 14 }, { "completion_length": 136.02272727272728, "epoch": 15.571428571428571, "grad_norm": 31.18145179748535, "kl": 22.365940256552264, "learning_rate": 5.271487265090163e-08, "loss": 0.0308, "reward": 3.1623240330002527, "reward_std": 1.3216449283063412, "rewards/concensus_correctness_reward_func": 0.9653409263965759, "rewards/consensus_reward_func": 0.8409090909090909, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.09090909090909091, "rewards/question_recreation_reward_func": 0.23630127632482487, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06818181818181818, "rewards/xmlcount_reward_func": 0.9606818367134441, "step": 16 }, { "completion_length": 146.86363636363637, "epoch": 17.571428571428573, "grad_norm": 39.81138229370117, "kl": 18.526999665932223, "learning_rate": 1.3545689574841341e-08, "loss": 0.0255, "reward": 3.284146102991971, "reward_std": 1.468788014894182, "rewards/concensus_correctness_reward_func": 1.177477299828421, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.045454545454545456, "rewards/question_recreation_reward_func": 0.22682786627079954, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09659090909090909, "rewards/xmlcount_reward_func": 0.9877954667264764, "step": 18 }, { "completion_length": 141.5909090909091, "epoch": 19.571428571428573, "grad_norm": 36.41402816772461, "kl": 89.27689167044379, "learning_rate": 0.0, "loss": 0.1228, "reward": 3.6040925925428215, "reward_std": 1.723052983934229, "rewards/concensus_correctness_reward_func": 1.141272744671865, "rewards/consensus_reward_func": 1.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1590909090909091, "rewards/question_recreation_reward_func": 0.28827438029375946, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.06818181818181818, "rewards/xmlcount_reward_func": 0.9472727450457487, "step": 20 }, { "epoch": 19.571428571428573, "step": 20, "total_flos": 0.0, "train_loss": 1.4377086507156491, "train_runtime": 480.0482, "train_samples_per_second": 1.333, "train_steps_per_second": 0.042 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }