{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 15, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.8, "completions/mean_length": 148.38466796875, "completions/mean_terminated_length": 120.88012237548828, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.064, "grad_norm": 0.016000226140022278, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 13353659.0, "reward": 0.43779296875, "reward_std": 0.30512999892234804, "rewards/accuracy_reward": 0.18955078125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.68603515625, "rewards/mean_confidence_reward": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01142578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.2, "completions/mean_length": 113.01220703125, "completions/mean_terminated_length": 102.50659942626953, "completions/min_length": 4.4, "completions/min_terminated_length": 4.4, "epoch": 0.128, "grad_norm": 0.005569960456341505, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 26368200.0, "reward": 0.5927734375, "reward_std": 0.18429518938064576, "rewards/accuracy_reward": 0.26328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.922265625, "rewards/mean_confidence_reward": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 84.0111328125, "completions/mean_terminated_length": 79.96037445068359, "completions/min_length": 9.2, "completions/min_terminated_length": 9.2, "epoch": 0.192, "grad_norm": 0.002098287222906947, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 39090618.0, "reward": 0.672998046875, "reward_std": 0.1340289294719696, "rewards/accuracy_reward": 0.36337890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9826171875, "rewards/mean_confidence_reward": 0.0, "step": 15 }, { "epoch": 0.192, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 247.5, "eval_completions/max_terminated_length": 247.5, "eval_completions/mean_length": 74.62883949279785, "eval_completions/mean_terminated_length": 74.62883949279785, "eval_completions/min_length": 22.5, "eval_completions/min_terminated_length": 22.5, "eval_loss": 0.0, "eval_num_tokens": 39090618.0, "eval_reward": 0.65625, "eval_reward_std": 0.21594678610563278, "eval_rewards/accuracy_reward": 0.314453125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.998046875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 24.5821, "eval_samples_per_second": 20.34, "eval_steps_per_second": 0.163, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013671875, "completions/max_length": 885.6, "completions/max_terminated_length": 481.4, "completions/mean_length": 75.769140625, "completions/mean_terminated_length": 74.47151794433594, "completions/min_length": 12.8, "completions/min_terminated_length": 12.8, "epoch": 0.256, "grad_norm": 0.0015658332267776132, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 51845790.0, "reward": 0.708984375, "reward_std": 0.10655935555696487, "rewards/accuracy_reward": 0.42177734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99619140625, "rewards/mean_confidence_reward": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 709.8, "completions/max_terminated_length": 506.6, "completions/mean_length": 73.475, "completions/mean_terminated_length": 73.0110580444336, "completions/min_length": 19.2, "completions/min_terminated_length": 19.2, "epoch": 0.32, "grad_norm": 0.0015233514131978154, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 64440126.0, "reward": 0.705615234375, "reward_std": 0.10088382810354232, "rewards/accuracy_reward": 0.4138671875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99736328125, "rewards/mean_confidence_reward": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 952.0, "completions/max_terminated_length": 627.2, "completions/mean_length": 76.364453125, "completions/mean_terminated_length": 75.99405670166016, "completions/min_length": 18.6, "completions/min_terminated_length": 18.6, "epoch": 0.384, "grad_norm": 0.002182575175538659, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 77054626.0, "reward": 0.708837890625, "reward_std": 0.0894511729478836, "rewards/accuracy_reward": 0.4197265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99794921875, "rewards/mean_confidence_reward": 0.0, "step": 30 }, { "epoch": 0.384, "eval_completions/clipped_ratio": 0.002155172413793094, "eval_completions/max_length": 414.5, "eval_completions/max_terminated_length": 204.75, "eval_completions/mean_length": 79.66776657104492, "eval_completions/mean_terminated_length": 77.63948631286621, "eval_completions/min_length": 31.25, "eval_completions/min_terminated_length": 31.25, "eval_loss": 0.0, "eval_num_tokens": 77054626.0, "eval_reward": 0.6640625, "eval_reward_std": 0.22485817223787308, "eval_rewards/accuracy_reward": 0.33203125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 33.0856, "eval_samples_per_second": 15.112, "eval_steps_per_second": 0.121, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 932.6, "completions/max_terminated_length": 520.6, "completions/mean_length": 80.10986328125, "completions/mean_terminated_length": 79.64854736328125, "completions/min_length": 18.6, "completions/min_terminated_length": 18.6, "epoch": 0.448, "grad_norm": 0.0010561308590695262, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 89702463.0, "reward": 0.72060546875, "reward_std": 0.08864349871873856, "rewards/accuracy_reward": 0.4431640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 800.6, "completions/max_terminated_length": 461.4, "completions/mean_length": 86.64755859375, "completions/mean_terminated_length": 86.28123168945312, "completions/min_length": 24.6, "completions/min_terminated_length": 24.6, "epoch": 0.512, "grad_norm": 0.0010048149852082133, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 102614142.0, "reward": 0.721337890625, "reward_std": 0.08699959516525269, "rewards/accuracy_reward": 0.44345703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99921875, "rewards/mean_confidence_reward": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 923.6, "completions/max_terminated_length": 530.4, "completions/mean_length": 89.90283203125, "completions/mean_terminated_length": 88.98892669677734, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "epoch": 0.576, "grad_norm": 0.0009294641204178333, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 115441355.0, "reward": 0.732666015625, "reward_std": 0.08389391750097275, "rewards/accuracy_reward": 0.4677734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99755859375, "rewards/mean_confidence_reward": 0.0, "step": 45 }, { "epoch": 0.576, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 269.5, "eval_completions/max_terminated_length": 269.5, "eval_completions/mean_length": 92.64102935791016, "eval_completions/mean_terminated_length": 92.64102935791016, "eval_completions/min_length": 35.5, "eval_completions/min_terminated_length": 35.5, "eval_loss": 0.0, "eval_num_tokens": 115441355.0, "eval_reward": 0.6865234375, "eval_reward_std": 0.23618583008646965, "eval_rewards/accuracy_reward": 0.373046875, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 26.3838, "eval_samples_per_second": 18.951, "eval_steps_per_second": 0.152, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00087890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 568.2, "completions/mean_length": 94.26240234375, "completions/mean_terminated_length": 93.44497528076172, "completions/min_length": 24.4, "completions/min_terminated_length": 24.4, "epoch": 0.64, "grad_norm": 0.0007046961691230536, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 128476370.0, "reward": 0.735302734375, "reward_std": 0.0788412094116211, "rewards/accuracy_reward": 0.47177734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998828125, "rewards/mean_confidence_reward": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 514.8, "completions/mean_length": 95.73056640625, "completions/mean_terminated_length": 94.82296295166016, "completions/min_length": 28.2, "completions/min_terminated_length": 28.2, "epoch": 0.704, "grad_norm": 0.0007479747291654348, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 141210483.0, "reward": 0.728857421875, "reward_std": 0.07646729648113251, "rewards/accuracy_reward": 0.45927734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9984375, "rewards/mean_confidence_reward": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00068359375, "completions/max_length": 896.2, "completions/max_terminated_length": 469.8, "completions/mean_length": 95.0162109375, "completions/mean_terminated_length": 94.38102111816406, "completions/min_length": 24.4, "completions/min_terminated_length": 24.4, "epoch": 0.768, "grad_norm": 0.0006520415190607309, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 154067105.0, "reward": 0.741455078125, "reward_std": 0.07960962057113648, "rewards/accuracy_reward": 0.48369140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99921875, "rewards/mean_confidence_reward": 0.0, "step": 60 }, { "epoch": 0.768, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 263.75, "eval_completions/max_terminated_length": 263.75, "eval_completions/mean_length": 96.76313209533691, "eval_completions/mean_terminated_length": 96.76313209533691, "eval_completions/min_length": 38.25, "eval_completions/min_terminated_length": 38.25, "eval_loss": 0.0, "eval_num_tokens": 154067105.0, "eval_reward": 0.69140625, "eval_reward_std": 0.23931611329317093, "eval_rewards/accuracy_reward": 0.384765625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.998046875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 25.814, "eval_samples_per_second": 19.369, "eval_steps_per_second": 0.155, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 774.0, "completions/max_terminated_length": 411.2, "completions/mean_length": 94.5296875, "completions/mean_terminated_length": 94.16671752929688, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.832, "grad_norm": 0.0007356010028161108, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 166964521.0, "reward": 0.751513671875, "reward_std": 0.07600467056035995, "rewards/accuracy_reward": 0.50341796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 443.2, "completions/mean_length": 97.67490234375, "completions/mean_terminated_length": 96.40738067626953, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.896, "grad_norm": 0.000633634568657726, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 179885808.0, "reward": 0.743115234375, "reward_std": 0.07322432547807693, "rewards/accuracy_reward": 0.48759765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986328125, "rewards/mean_confidence_reward": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 403.2, "completions/mean_length": 98.701171875, "completions/mean_terminated_length": 97.97780151367188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.96, "grad_norm": 0.0007835639989934862, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 192751292.0, "reward": 0.739453125, "reward_std": 0.07245174199342727, "rewards/accuracy_reward": 0.4796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99921875, "rewards/mean_confidence_reward": 0.0, "step": 75 }, { "epoch": 0.96, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 253.25, "eval_completions/max_terminated_length": 253.25, "eval_completions/mean_length": 96.76939582824707, "eval_completions/mean_terminated_length": 96.76939582824707, "eval_completions/min_length": 46.0, "eval_completions/min_terminated_length": 46.0, "eval_loss": 0.0, "eval_num_tokens": 192751292.0, "eval_reward": 0.7021484375, "eval_reward_std": 0.23983138427138329, "eval_rewards/accuracy_reward": 0.404296875, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 25.6674, "eval_samples_per_second": 19.48, "eval_steps_per_second": 0.156, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.6666666666666, "completions/max_terminated_length": 576.6666666666666, "completions/mean_length": 97.17679595947266, "completions/mean_terminated_length": 97.17679595947266, "completions/min_length": 32.333333333333336, "completions/min_terminated_length": 32.333333333333336, "epoch": 0.9984, "num_tokens": 200463724.0, "reward": 0.7422688802083334, "reward_std": 0.07581798732280731, "rewards/accuracy_reward": 0.4845377604166667, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 78, "total_flos": 0.0, "train_loss": 0.0031132084311535344, "train_runtime": 28707.0758, "train_samples_per_second": 0.697, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 78, "num_input_tokens_seen": 200463724, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }