6892 lines
436 KiB
JSON
6892 lines
436 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.49919376007799904,
|
|
"eval_steps": 50,
|
|
"global_step": 208,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"calibration/aurc": 0.4755090430338697,
|
|
"calibration/batch_distribution_entropy": 0.26199859861521857,
|
|
"calibration/batch_entropy_100bins": 0.3438930495423692,
|
|
"calibration/batch_entropy_10bins": 0.26199859861521857,
|
|
"calibration/batch_entropy_50bins": 0.3997214906203269,
|
|
"calibration/batch_uniqueness": 0.4832166822381069,
|
|
"calibration/confidence_entropy": 0.20919231184298712,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.0,
|
|
"calibration/coverage@15%": 0.0,
|
|
"calibration/coverage@20%": 0.0,
|
|
"calibration/coverage@25%": 0.0,
|
|
"calibration/coverage@30%": 0.0,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.26199859861521857,
|
|
"calibration/distribution_entropy_100": 0.3438930495423692,
|
|
"calibration/ece": 0.4549087624937119,
|
|
"calibration/mean_confidence": 0.9209537398939647,
|
|
"calibration/unique_confidence_per_question": 0.03177083333333333,
|
|
"calibration/unique_confidences": 12.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.019357638888888907,
|
|
"completions/max_length": 3991.8,
|
|
"completions/max_terminated_length": 3991.8,
|
|
"completions/mean_length": 515.6087646484375,
|
|
"completions/mean_terminated_length": 525.7942260742187,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.011999850001874977,
|
|
"grad_norm": 0.004724407568573952,
|
|
"learning_rate": 5.952380952380953e-07,
|
|
"loss": 0.0065,
|
|
"num_tokens": 9054021.0,
|
|
"reward": 0.5780223369598388,
|
|
"reward_std": 0.5210743069648742,
|
|
"rewards/accuracy_reward": 0.26449652314186095,
|
|
"rewards/brier_reward": 0.3152239501476288,
|
|
"rewards/confidence_uniqueness_reward": 0.2885810971260071,
|
|
"rewards/format_reward": 0.6014756917953491,
|
|
"rewards/frontier_aurc_reward": 0.27824242115020753,
|
|
"rewards/frontier_coverage_0": 0.27824242115020753,
|
|
"rewards/frontier_coverage_1": 0.27824242115020753,
|
|
"rewards/frontier_coverage_10": 0.27824242115020753,
|
|
"rewards/frontier_coverage_15": 0.27824242115020753,
|
|
"rewards/frontier_coverage_20": 0.27824242115020753,
|
|
"rewards/frontier_coverage_25": 0.27824242115020753,
|
|
"rewards/frontier_coverage_5": 0.27824242115020753,
|
|
"rewards/true_frontier_ece_gap_only_reward": 0.27824242115020753,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.31138780117034914,
|
|
"signal/accuracy_reward/group_std_mean": 0.37181236147880553,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.08055555745959282,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.15569390058517457,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.15569390058517457,
|
|
"signal/advantage_abs_mean": 0.4485911726951599,
|
|
"signal/advantage_pre_scale_abs_mean": 0.4485911726951599,
|
|
"signal/advantage_pre_scale_std": 0.5264933466911316,
|
|
"signal/advantage_std": 0.5264933466911316,
|
|
"signal/brier_reward/centered_abs_mean": 0.3207183539867401,
|
|
"signal/brier_reward/group_std_mean": 0.37424429655075075,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04008979424834251,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.04008979424834251,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.23610488772392274,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.2880967080593109,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.029513110965490343,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.029513110965490343,
|
|
"signal/format_reward/centered_abs_mean": 0.43846028447151186,
|
|
"signal/format_reward/group_std_mean": 0.4738844096660614,
|
|
"signal/format_reward/group_zero_std_frac": 0.0,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.21923014223575593,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.21923014223575593,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.3114172875881195,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.36980949640274047,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.004865895118564367,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.3114172875881195,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.36980949640274047,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.038927160948514936,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.038927160948514936,
|
|
"step": 5
|
|
},
|
|
{
|
|
"calibration/aurc": 0.5159006411390681,
|
|
"calibration/batch_distribution_entropy": 0.23845090979417666,
|
|
"calibration/batch_entropy_100bins": 0.33576880006525267,
|
|
"calibration/batch_entropy_10bins": 0.23845090979417666,
|
|
"calibration/batch_entropy_50bins": 0.3873401847509245,
|
|
"calibration/batch_uniqueness": 0.4823728144800886,
|
|
"calibration/confidence_entropy": 0.21192807755010623,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.0,
|
|
"calibration/coverage@15%": 0.0,
|
|
"calibration/coverage@20%": 0.0,
|
|
"calibration/coverage@25%": 0.0,
|
|
"calibration/coverage@30%": 0.0,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.23845090979417666,
|
|
"calibration/distribution_entropy_100": 0.33576880006525267,
|
|
"calibration/ece": 0.48419175646218493,
|
|
"calibration/mean_confidence": 0.9245204458265471,
|
|
"calibration/unique_confidence_per_question": 0.03072916666666666,
|
|
"calibration/unique_confidences": 11.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.018750000000000024,
|
|
"completions/max_length": 4070.8,
|
|
"completions/max_terminated_length": 4070.8,
|
|
"completions/mean_length": 476.8085998535156,
|
|
"completions/mean_terminated_length": 486.0776733398437,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 19.8,
|
|
"epoch": 0.023999700003749954,
|
|
"grad_norm": 0.06719768047332764,
|
|
"learning_rate": 1.1904761904761906e-06,
|
|
"loss": 0.0027,
|
|
"num_tokens": 17629576.0,
|
|
"reward": 0.6722566485404968,
|
|
"reward_std": 0.48708855509758,
|
|
"rewards/accuracy_reward": 0.29557291269302366,
|
|
"rewards/brier_reward": 0.35851759910583497,
|
|
"rewards/confidence_uniqueness_reward": 0.3507663607597351,
|
|
"rewards/format_reward": 0.7157118082046509,
|
|
"rewards/frontier_aurc_reward": 0.3118152379989624,
|
|
"rewards/frontier_coverage_0": 0.3118152379989624,
|
|
"rewards/frontier_coverage_1": 0.3118152379989624,
|
|
"rewards/frontier_coverage_10": 0.3118152379989624,
|
|
"rewards/frontier_coverage_15": 0.3118152379989624,
|
|
"rewards/frontier_coverage_20": 0.3118152379989624,
|
|
"rewards/frontier_coverage_25": 0.3118152379989624,
|
|
"rewards/frontier_coverage_5": 0.3118152379989624,
|
|
"rewards/true_frontier_ece_gap_only_reward": 0.3118152379989624,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.3207736611366272,
|
|
"signal/accuracy_reward/group_std_mean": 0.37864009737968446,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.08333333507180214,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1603868305683136,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.1603868305683136,
|
|
"signal/advantage_abs_mean": 0.4065045177936554,
|
|
"signal/advantage_pre_scale_abs_mean": 0.4065045177936554,
|
|
"signal/advantage_pre_scale_std": 0.49197044372558596,
|
|
"signal/advantage_std": 0.49197044372558596,
|
|
"signal/brier_reward/centered_abs_mean": 0.31853480339050294,
|
|
"signal/brier_reward/group_std_mean": 0.37187020778656005,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03981685042381287,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.03981685042381287,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.21813510358333588,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.2754356682300568,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.027266887947916985,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.027266887947916985,
|
|
"signal/format_reward/centered_abs_mean": 0.353564453125,
|
|
"signal/format_reward/group_std_mean": 0.41884335279464724,
|
|
"signal/format_reward/group_zero_std_frac": 0.00555555559694767,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.1767822265625,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.1767822265625,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.3167228579521179,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.37375251650810243,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.004948794655501842,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.3167228579521179,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.37375251650810243,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.03959035724401474,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.03959035724401474,
|
|
"step": 10
|
|
},
|
|
{
|
|
"calibration/aurc": 0.5224229909940956,
|
|
"calibration/batch_distribution_entropy": 0.2877403602460552,
|
|
"calibration/batch_entropy_100bins": 0.3573186157704617,
|
|
"calibration/batch_entropy_10bins": 0.2877403602460552,
|
|
"calibration/batch_entropy_50bins": 0.41523268744576436,
|
|
"calibration/batch_uniqueness": 0.5167436160191917,
|
|
"calibration/confidence_entropy": 0.23103006074957716,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.0,
|
|
"calibration/coverage@15%": 0.0,
|
|
"calibration/coverage@20%": 0.0,
|
|
"calibration/coverage@25%": 0.0,
|
|
"calibration/coverage@30%": 0.0,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.2877403602460552,
|
|
"calibration/distribution_entropy_100": 0.3573186157704617,
|
|
"calibration/ece": 0.4970179613664881,
|
|
"calibration/mean_confidence": 0.9129537058033463,
|
|
"calibration/unique_confidence_per_question": 0.036979166666666674,
|
|
"calibration/unique_confidences": 14.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.010850694444444442,
|
|
"completions/max_length": 3950.0,
|
|
"completions/max_terminated_length": 3950.0,
|
|
"completions/mean_length": 433.64210815429686,
|
|
"completions/mean_terminated_length": 438.44232177734375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 59.0,
|
|
"epoch": 0.03599955000562493,
|
|
"grad_norm": 0.0016660373657941818,
|
|
"learning_rate": 1.7857142857142859e-06,
|
|
"loss": -0.0113,
|
|
"num_tokens": 25727117.0,
|
|
"reward": 0.8379699349403381,
|
|
"reward_std": 0.3784303069114685,
|
|
"rewards/accuracy_reward": 0.32907986640930176,
|
|
"rewards/brier_reward": 0.431581848859787,
|
|
"rewards/confidence_uniqueness_reward": 0.5059985220432281,
|
|
"rewards/format_reward": 0.93359375,
|
|
"rewards/frontier_aurc_reward": 0.3577423691749573,
|
|
"rewards/frontier_coverage_0": 0.3577423691749573,
|
|
"rewards/frontier_coverage_1": 0.3577423691749573,
|
|
"rewards/frontier_coverage_10": 0.3577423691749573,
|
|
"rewards/frontier_coverage_15": 0.3577423691749573,
|
|
"rewards/frontier_coverage_20": 0.3577423691749573,
|
|
"rewards/frontier_coverage_25": 0.3577423691749573,
|
|
"rewards/frontier_coverage_5": 0.3577423691749573,
|
|
"rewards/true_frontier_ece_gap_only_reward": 0.3577423691749573,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.3203721702098846,
|
|
"signal/accuracy_reward/group_std_mean": 0.37649917006492617,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.09722222462296486,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1601860851049423,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.1601860851049423,
|
|
"signal/advantage_abs_mean": 0.30626789927482606,
|
|
"signal/advantage_pre_scale_abs_mean": 0.30626789927482606,
|
|
"signal/advantage_pre_scale_std": 0.388842511177063,
|
|
"signal/advantage_std": 0.388842511177063,
|
|
"signal/brier_reward/centered_abs_mean": 0.30102636218070983,
|
|
"signal/brier_reward/group_std_mean": 0.3518189787864685,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03762829527258873,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.03762829527258873,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.17830342054367065,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.2288795828819275,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.022287927567958832,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.022287927567958832,
|
|
"signal/format_reward/centered_abs_mean": 0.1138617604970932,
|
|
"signal/format_reward/group_std_mean": 0.19477857500314713,
|
|
"signal/format_reward/group_zero_std_frac": 0.2944444492459297,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0569308802485466,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.0569308802485466,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.31336275935173036,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.3662181556224823,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.004896293114870787,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.31336275935173036,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.3662181556224823,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.039170344918966295,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.039170344918966295,
|
|
"step": 15
|
|
},
|
|
{
|
|
"calibration/aurc": 0.44171379177515535,
|
|
"calibration/batch_distribution_entropy": 0.3765147360973443,
|
|
"calibration/batch_entropy_100bins": 0.39381071707061527,
|
|
"calibration/batch_entropy_10bins": 0.3765147360973443,
|
|
"calibration/batch_entropy_50bins": 0.4577038412907034,
|
|
"calibration/batch_uniqueness": 0.5961709299135531,
|
|
"calibration/buffer_distribution_entropy": 0.29230688761468687,
|
|
"calibration/buffer_entropy_100bins": 0.36493243936626785,
|
|
"calibration/buffer_entropy_10bins": 0.29230688761468687,
|
|
"calibration/buffer_entropy_50bins": 0.42293687132466956,
|
|
"calibration/confidence_entropy": 0.2898413619535891,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.0,
|
|
"calibration/coverage@15%": 0.0,
|
|
"calibration/coverage@20%": 0.0,
|
|
"calibration/coverage@25%": 0.034031413612565446,
|
|
"calibration/coverage@30%": 0.07905759162303665,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.3765147360973443,
|
|
"calibration/distribution_entropy_100": 0.39381071707061527,
|
|
"calibration/ece": 0.3790869053038728,
|
|
"calibration/mean_confidence": 0.8925066797565309,
|
|
"calibration/unique_confidence_per_question": 0.035416666666666666,
|
|
"calibration/unique_confidences": 13.6,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.010763888888888884,
|
|
"completions/max_length": 3739.4,
|
|
"completions/max_terminated_length": 3739.4,
|
|
"completions/mean_length": 471.2155456542969,
|
|
"completions/mean_terminated_length": 476.41375732421875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 91.8,
|
|
"epoch": 0.04799940000749991,
|
|
"grad_norm": 0.0008532739011570811,
|
|
"learning_rate": 2.380952380952381e-06,
|
|
"loss": -0.0093,
|
|
"num_tokens": 34269216.0,
|
|
"reward": 0.8856567025184632,
|
|
"reward_std": 0.2739575058221817,
|
|
"rewards/accuracy_reward": 0.4450520873069763,
|
|
"rewards/brier_reward": 0.5553683876991272,
|
|
"rewards/confidence_uniqueness_reward": 0.5914790272712708,
|
|
"rewards/format_reward": 0.9831597328186035,
|
|
"rewards/frontier_aurc_reward": 0.1789298068732023,
|
|
"rewards/frontier_coverage_0": 0.18989355927333235,
|
|
"rewards/frontier_coverage_1": 0.18989355927333235,
|
|
"rewards/frontier_coverage_10": 0.18989355927333235,
|
|
"rewards/frontier_coverage_15": 0.18989355927333235,
|
|
"rewards/frontier_coverage_20": 0.18989355927333235,
|
|
"rewards/frontier_coverage_25": 0.18989355927333235,
|
|
"rewards/frontier_coverage_5": 0.18989355927333235,
|
|
"rewards/true_frontier_ece_gap_only_reward": 0.03703599572181702,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.2997667074203491,
|
|
"signal/accuracy_reward/group_std_mean": 0.36736690402030947,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.06944444626569748,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.14988335371017455,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.14988335371017455,
|
|
"signal/advantage_abs_mean": 0.21930149793624878,
|
|
"signal/advantage_pre_scale_abs_mean": 0.21930149793624878,
|
|
"signal/advantage_pre_scale_std": 0.28236431181430816,
|
|
"signal/advantage_std": 0.28236431181430816,
|
|
"signal/brier_reward/centered_abs_mean": 0.2637813687324524,
|
|
"signal/brier_reward/group_std_mean": 0.3207400619983673,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03297267109155655,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.03297267109155655,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1622892886400223,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.19781720638275146,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.020286161080002786,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.020286161080002786,
|
|
"signal/format_reward/centered_abs_mean": 0.03038194477558136,
|
|
"signal/format_reward/group_std_mean": 0.06310995742678642,
|
|
"signal/format_reward/group_zero_std_frac": 0.7222222328186035,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01519097238779068,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.01519097238779068,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.11971323965117335,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.14516795333474874,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.0018705193695495836,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.0018705193695495836,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.13567787148058413,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1734710790216923,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.13567787148058413,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1734710790216923,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.13567787148058413,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1734710790216923,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.13567787148058413,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.1734710790216923,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.13567787148058413,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.1734710790216923,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.13567787148058413,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.1734710790216923,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.13567787148058413,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1734710790216923,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.002119966741884127,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.1349403366446495,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.16668230146169663,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.016867542080581187,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.016867542080581187,
|
|
"step": 20
|
|
},
|
|
{
|
|
"calibration/aurc": 0.3429455132381247,
|
|
"calibration/batch_distribution_entropy": 0.507851007126862,
|
|
"calibration/batch_entropy_100bins": 0.44183174453138746,
|
|
"calibration/batch_entropy_10bins": 0.507851007126862,
|
|
"calibration/batch_entropy_50bins": 0.514838438261066,
|
|
"calibration/batch_uniqueness": 0.6844458142688816,
|
|
"calibration/buffer_distribution_entropy": 0.3345262025303308,
|
|
"calibration/buffer_entropy_100bins": 0.3853994626063969,
|
|
"calibration/buffer_entropy_10bins": 0.3345262025303308,
|
|
"calibration/buffer_entropy_50bins": 0.44640286826484654,
|
|
"calibration/confidence_entropy": 0.34430819143240965,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.0,
|
|
"calibration/coverage@15%": 0.05654450261780105,
|
|
"calibration/coverage@20%": 0.09424083769633508,
|
|
"calibration/coverage@25%": 0.22486700447763291,
|
|
"calibration/coverage@30%": 0.43646112600536197,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.507851007126862,
|
|
"calibration/distribution_entropy_100": 0.44183174453138746,
|
|
"calibration/ece": 0.26461993450442634,
|
|
"calibration/mean_confidence": 0.8628734329706985,
|
|
"calibration/unique_confidence_per_question": 0.0421875,
|
|
"calibration/unique_confidences": 16.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.009895833333333348,
|
|
"completions/max_length": 4003.6,
|
|
"completions/max_terminated_length": 4003.6,
|
|
"completions/mean_length": 524.5051208496094,
|
|
"completions/mean_terminated_length": 529.7198486328125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 107.6,
|
|
"epoch": 0.05999925000937488,
|
|
"grad_norm": 0.001525247236713767,
|
|
"learning_rate": 2.9761904761904763e-06,
|
|
"loss": -0.0065,
|
|
"num_tokens": 43435963.0,
|
|
"reward": 0.9095749855041504,
|
|
"reward_std": 0.21235645115375518,
|
|
"rewards/accuracy_reward": 0.5509548485279083,
|
|
"rewards/brier_reward": 0.6560544490814209,
|
|
"rewards/confidence_uniqueness_reward": 0.6728395104408265,
|
|
"rewards/format_reward": 0.9865451335906983,
|
|
"rewards/frontier_aurc_reward": -0.004245653562247753,
|
|
"rewards/frontier_coverage_0": 0.003410888835787773,
|
|
"rewards/frontier_coverage_1": 0.003410888835787773,
|
|
"rewards/frontier_coverage_10": 0.003410888835787773,
|
|
"rewards/frontier_coverage_15": 0.003410888835787773,
|
|
"rewards/frontier_coverage_20": 0.003410888835787773,
|
|
"rewards/frontier_coverage_25": 0.003410888835787773,
|
|
"rewards/frontier_coverage_5": 0.003410888835787773,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.20474808514118195,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.26726887822151185,
|
|
"signal/accuracy_reward/group_std_mean": 0.333760267496109,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.12777777910232543,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.13363443911075593,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.13363443911075593,
|
|
"signal/advantage_abs_mean": 0.1648347020149231,
|
|
"signal/advantage_pre_scale_abs_mean": 0.1648347020149231,
|
|
"signal/advantage_pre_scale_std": 0.22817236185073853,
|
|
"signal/advantage_std": 0.22817236185073853,
|
|
"signal/brier_reward/centered_abs_mean": 0.21432596445083618,
|
|
"signal/brier_reward/group_std_mean": 0.2680306822061539,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.026790745556354523,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.026790745556354523,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.12359119206666946,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.15292936861515044,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.015448899008333683,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.015448899008333683,
|
|
"signal/format_reward/centered_abs_mean": 0.024397786147892474,
|
|
"signal/format_reward/group_std_mean": 0.051703880354762075,
|
|
"signal/format_reward/group_zero_std_frac": 0.7694444537162781,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.012198893073946237,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.012198893073946237,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0031622422859072684,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.004759848862886429,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.941003571730107e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.941003571730107e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.03383462205529213,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.0554510623216629,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.03383462205529213,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.0554510623216629,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.03383462205529213,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.0554510623216629,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.03383462205529213,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.0554510623216629,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.03383462205529213,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.0554510623216629,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.03383462205529213,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.0554510623216629,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.03383462205529213,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.0554510623216629,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0005286659696139395,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.03566240519285202,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.04645907133817673,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0044578006491065025,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0044578006491065025,
|
|
"step": 25
|
|
},
|
|
{
|
|
"calibration/aurc": 0.3084429936468077,
|
|
"calibration/batch_distribution_entropy": 0.6357372712721332,
|
|
"calibration/batch_entropy_100bins": 0.4579116256178331,
|
|
"calibration/batch_entropy_10bins": 0.6357372712721332,
|
|
"calibration/batch_entropy_50bins": 0.5364450128476048,
|
|
"calibration/batch_uniqueness": 0.7160179009317768,
|
|
"calibration/buffer_distribution_entropy": 0.4059973934246511,
|
|
"calibration/buffer_entropy_100bins": 0.4151252075471626,
|
|
"calibration/buffer_entropy_10bins": 0.4059973934246511,
|
|
"calibration/buffer_entropy_50bins": 0.4819283775367138,
|
|
"calibration/confidence_entropy": 0.46384456781483224,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.005759162303664921,
|
|
"calibration/coverage@15%": 0.04192937506962237,
|
|
"calibration/coverage@20%": 0.04298485017266347,
|
|
"calibration/coverage@25%": 0.14497929495647485,
|
|
"calibration/coverage@30%": 0.48877619011061546,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.6357372712721332,
|
|
"calibration/distribution_entropy_100": 0.4579116256178331,
|
|
"calibration/ece": 0.1707801967802031,
|
|
"calibration/mean_confidence": 0.7917407206813284,
|
|
"calibration/unique_confidence_per_question": 0.0359375,
|
|
"calibration/unique_confidences": 13.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015711805555555534,
|
|
"completions/max_length": 4078.4,
|
|
"completions/max_terminated_length": 4078.4,
|
|
"completions/mean_length": 598.9315185546875,
|
|
"completions/mean_terminated_length": 608.5085327148438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 125.6,
|
|
"epoch": 0.07199910001124986,
|
|
"grad_norm": 0.0004900748026557267,
|
|
"learning_rate": 3.5714285714285718e-06,
|
|
"loss": -0.0094,
|
|
"num_tokens": 53445574.0,
|
|
"reward": 0.9443552374839783,
|
|
"reward_std": 0.1934140741825104,
|
|
"rewards/accuracy_reward": 0.5914930582046509,
|
|
"rewards/brier_reward": 0.7071029067039489,
|
|
"rewards/confidence_uniqueness_reward": 0.6907591581344604,
|
|
"rewards/format_reward": 0.9809895753860474,
|
|
"rewards/frontier_aurc_reward": -0.0032980738673359157,
|
|
"rewards/frontier_coverage_0": -0.006392185157164931,
|
|
"rewards/frontier_coverage_1": -0.006392185157164931,
|
|
"rewards/frontier_coverage_10": -0.006392185157164931,
|
|
"rewards/frontier_coverage_15": -0.006392185157164931,
|
|
"rewards/frontier_coverage_20": -0.006392185157164931,
|
|
"rewards/frontier_coverage_25": -0.006392185157164931,
|
|
"rewards/frontier_coverage_5": -0.006392185157164931,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.1269455760717392,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.23853081464767456,
|
|
"signal/accuracy_reward/group_std_mean": 0.2985024094581604,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.21111111491918563,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.11926540732383728,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.11926540732383728,
|
|
"signal/advantage_abs_mean": 0.14833838045597075,
|
|
"signal/advantage_pre_scale_abs_mean": 0.14833838045597075,
|
|
"signal/advantage_pre_scale_std": 0.21972199380397797,
|
|
"signal/advantage_std": 0.21972199380397797,
|
|
"signal/brier_reward/centered_abs_mean": 0.17149352431297302,
|
|
"signal/brier_reward/group_std_mean": 0.21699302196502684,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.021436690539121627,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.021436690539121627,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.11525630950927734,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.14727450013160706,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.014407038688659668,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.014407038688659668,
|
|
"signal/format_reward/centered_abs_mean": 0.03138563372194767,
|
|
"signal/format_reward/group_std_mean": 0.06032953634858131,
|
|
"signal/format_reward/group_zero_std_frac": 0.7444444537162781,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.015692816860973836,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.015692816860973836,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.001896983222104609,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0030401549767702816,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.9640362845384517e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.9640362845384517e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.04802608713507652,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.06869390532374382,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.04802608713507652,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.06869390532374382,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.04802608713507652,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.06869390532374382,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.04802608713507652,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.06869390532374382,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04802608713507652,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.06869390532374382,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.04802608713507652,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.06869390532374382,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.04802608713507652,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.06869390532374382,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0007504076114855706,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.03984055146574974,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.0505749449133873,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.004980068933218717,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.004980068933218717,
|
|
"step": 30
|
|
},
|
|
{
|
|
"calibration/aurc": 0.250599098148718,
|
|
"calibration/batch_distribution_entropy": 0.6323867153697071,
|
|
"calibration/batch_entropy_100bins": 0.42821853049324937,
|
|
"calibration/batch_entropy_10bins": 0.6323867153697071,
|
|
"calibration/batch_entropy_50bins": 0.5033903372089894,
|
|
"calibration/batch_uniqueness": 0.6367832585438589,
|
|
"calibration/buffer_distribution_entropy": 0.5053599848783283,
|
|
"calibration/buffer_entropy_100bins": 0.45108017532774436,
|
|
"calibration/buffer_entropy_10bins": 0.5053599848783283,
|
|
"calibration/buffer_entropy_50bins": 0.5249452336686906,
|
|
"calibration/confidence_entropy": 0.5795754506284359,
|
|
"calibration/coverage@0%": 0.01114940403252757,
|
|
"calibration/coverage@1%": 0.01114940403252757,
|
|
"calibration/coverage@10%": 0.023336877784522418,
|
|
"calibration/coverage@15%": 0.045042137534781396,
|
|
"calibration/coverage@20%": 0.21714213102924745,
|
|
"calibration/coverage@25%": 0.5910950020422562,
|
|
"calibration/coverage@30%": 0.8375,
|
|
"calibration/coverage@5%": 0.01114940403252757,
|
|
"calibration/distribution_entropy_10": 0.6323867153697071,
|
|
"calibration/distribution_entropy_100": 0.42821853049324937,
|
|
"calibration/ece": 0.10070792051637412,
|
|
"calibration/mean_confidence": 0.6918897482432823,
|
|
"calibration/unique_confidence_per_question": 0.0375,
|
|
"calibration/unique_confidences": 14.4,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.017534722222222233,
|
|
"completions/max_length": 4016.6,
|
|
"completions/max_terminated_length": 4016.6,
|
|
"completions/mean_length": 656.1796997070312,
|
|
"completions/mean_terminated_length": 667.9824096679688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 189.6,
|
|
"epoch": 0.08399895001312484,
|
|
"grad_norm": 0.0004635561490431428,
|
|
"learning_rate": 4.166666666666667e-06,
|
|
"loss": -0.0114,
|
|
"num_tokens": 64082204.0,
|
|
"reward": 0.9670976400375366,
|
|
"reward_std": 0.17328265607357024,
|
|
"rewards/accuracy_reward": 0.6295138835906983,
|
|
"rewards/brier_reward": 0.7479526400566101,
|
|
"rewards/confidence_uniqueness_reward": 0.6298671245574952,
|
|
"rewards/format_reward": 0.9805555582046509,
|
|
"rewards/frontier_aurc_reward": -0.002723962301388383,
|
|
"rewards/frontier_coverage_0": -0.023209616425447166,
|
|
"rewards/frontier_coverage_1": -0.023209616425447166,
|
|
"rewards/frontier_coverage_10": -0.023209616425447166,
|
|
"rewards/frontier_coverage_15": -0.023209616425447166,
|
|
"rewards/frontier_coverage_20": -0.023209616425447166,
|
|
"rewards/frontier_coverage_25": -0.023209616425447166,
|
|
"rewards/frontier_coverage_5": -0.023209616425447166,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.060667777061462404,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.21026475727558136,
|
|
"signal/accuracy_reward/group_std_mean": 0.26806623935699464,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.2722222238779068,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.10513237863779068,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.10513237863779068,
|
|
"signal/advantage_abs_mean": 0.13171991258859633,
|
|
"signal/advantage_pre_scale_abs_mean": 0.13171991258859633,
|
|
"signal/advantage_pre_scale_std": 0.19843848645687104,
|
|
"signal/advantage_std": 0.19843848645687104,
|
|
"signal/brier_reward/centered_abs_mean": 0.12574937492609023,
|
|
"signal/brier_reward/group_std_mean": 0.16387327909469604,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01571867186576128,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01571867186576128,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.17439252138137817,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.20724063515663146,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02179906517267227,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.02179906517267227,
|
|
"signal/format_reward/centered_abs_mean": 0.03100043386220932,
|
|
"signal/format_reward/group_std_mean": 0.055044320225715634,
|
|
"signal/format_reward/group_zero_std_frac": 0.7833333492279053,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01550021693110466,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.01550021693110466,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0009950165753252805,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.001629676064476371,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.5547133989457508e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.5547133989457508e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.07657658159732819,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.09882448017597198,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.07657658159732819,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.09882448017597198,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.07657658159732819,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.09882448017597198,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.07657658159732819,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.09882448017597198,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.07657658159732819,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.09882448017597198,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.07657658159732819,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.09882448017597198,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.07657658159732819,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.09882448017597198,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001196509087458253,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.030044597759842872,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.04015489742159843,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.003755574719980359,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.003755574719980359,
|
|
"step": 35
|
|
},
|
|
{
|
|
"calibration/aurc": 0.33153256701336514,
|
|
"calibration/batch_distribution_entropy": 0.5854273557012318,
|
|
"calibration/batch_entropy_100bins": 0.4253124440379624,
|
|
"calibration/batch_entropy_10bins": 0.5854273557012318,
|
|
"calibration/batch_entropy_50bins": 0.4994252045234801,
|
|
"calibration/batch_uniqueness": 0.6444065784609718,
|
|
"calibration/buffer_distribution_entropy": 0.5848560770733751,
|
|
"calibration/buffer_entropy_100bins": 0.4806630298602917,
|
|
"calibration/buffer_entropy_10bins": 0.5848560770733751,
|
|
"calibration/buffer_entropy_50bins": 0.5604947004501042,
|
|
"calibration/confidence_entropy": 0.6171869593013805,
|
|
"calibration/coverage@0%": 0.006288407488631675,
|
|
"calibration/coverage@1%": 0.006288407488631675,
|
|
"calibration/coverage@10%": 0.006288407488631675,
|
|
"calibration/coverage@15%": 0.020387624198814444,
|
|
"calibration/coverage@20%": 0.020387624198814444,
|
|
"calibration/coverage@25%": 0.22147370367411084,
|
|
"calibration/coverage@30%": 0.2721291791843428,
|
|
"calibration/coverage@5%": 0.006288407488631675,
|
|
"calibration/distribution_entropy_10": 0.5854273557012318,
|
|
"calibration/distribution_entropy_100": 0.4253124440379624,
|
|
"calibration/ece": 0.09023499731095522,
|
|
"calibration/mean_confidence": 0.6499715924246285,
|
|
"calibration/unique_confidence_per_question": 0.03958333333333333,
|
|
"calibration/unique_confidences": 15.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.012760416666666653,
|
|
"completions/max_length": 3917.2,
|
|
"completions/max_terminated_length": 3917.2,
|
|
"completions/mean_length": 703.5712768554688,
|
|
"completions/mean_terminated_length": 712.6527587890625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 197.0,
|
|
"epoch": 0.09599880001499982,
|
|
"grad_norm": 0.0004154318303335458,
|
|
"learning_rate": 4.761904761904762e-06,
|
|
"loss": -0.0103,
|
|
"num_tokens": 75306865.0,
|
|
"reward": 0.9813725113868713,
|
|
"reward_std": 0.15861513316631318,
|
|
"rewards/accuracy_reward": 0.6460069417953491,
|
|
"rewards/brier_reward": 0.7578154802322388,
|
|
"rewards/confidence_uniqueness_reward": 0.629870867729187,
|
|
"rewards/format_reward": 0.985156238079071,
|
|
"rewards/frontier_aurc_reward": -0.002467139856889844,
|
|
"rewards/frontier_coverage_0": -0.03592981658875942,
|
|
"rewards/frontier_coverage_1": -0.03592981658875942,
|
|
"rewards/frontier_coverage_10": -0.03592981658875942,
|
|
"rewards/frontier_coverage_15": -0.03592981658875942,
|
|
"rewards/frontier_coverage_20": -0.03592981658875942,
|
|
"rewards/frontier_coverage_25": -0.03592981658875942,
|
|
"rewards/frontier_coverage_5": -0.03592981658875942,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.029612084105610847,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.19372829794883728,
|
|
"signal/accuracy_reward/group_std_mean": 0.2543206661939621,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.286111119389534,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09686414897441864,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09686414897441864,
|
|
"signal/advantage_abs_mean": 0.11751253008842469,
|
|
"signal/advantage_pre_scale_abs_mean": 0.11751253008842469,
|
|
"signal/advantage_pre_scale_std": 0.186165389418602,
|
|
"signal/advantage_std": 0.186165389418602,
|
|
"signal/brier_reward/centered_abs_mean": 0.11083936840295791,
|
|
"signal/brier_reward/group_std_mean": 0.14510888755321502,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013854921050369739,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013854921050369739,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1665905848145485,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.1974548101425171,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.020823823101818562,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.020823823101818562,
|
|
"signal/format_reward/centered_abs_mean": 0.02540690116584301,
|
|
"signal/format_reward/group_std_mean": 0.04733345359563827,
|
|
"signal/format_reward/group_zero_std_frac": 0.8055555701255799,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.012703450582921504,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.012703450582921504,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0008696626755408943,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0013225122122094036,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.3588479305326473e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.3588479305326473e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.09154021292924881,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.11824491173028946,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.09154021292924881,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.11824491173028946,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.09154021292924881,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.11824491173028946,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09154021292924881,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.11824491173028946,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.09154021292924881,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.11824491173028946,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.09154021292924881,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.11824491173028946,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.09154021292924881,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.11824491173028946,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0014303158270195127,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.02095247954130173,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.03012901544570923,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.002619059942662716,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.002619059942662716,
|
|
"step": 40
|
|
},
|
|
{
|
|
"calibration/aurc": 0.20306210885118028,
|
|
"calibration/batch_distribution_entropy": 0.6887549737306013,
|
|
"calibration/batch_entropy_100bins": 0.4745983011730009,
|
|
"calibration/batch_entropy_10bins": 0.6887549737306013,
|
|
"calibration/batch_entropy_50bins": 0.5543744512155419,
|
|
"calibration/batch_uniqueness": 0.6833862362276909,
|
|
"calibration/buffer_distribution_entropy": 0.6364236887118315,
|
|
"calibration/buffer_entropy_100bins": 0.5046055763785582,
|
|
"calibration/buffer_entropy_10bins": 0.6364236887118315,
|
|
"calibration/buffer_entropy_50bins": 0.5889470060117332,
|
|
"calibration/confidence_entropy": 0.5883243712866314,
|
|
"calibration/coverage@0%": 0.014789383258954939,
|
|
"calibration/coverage@1%": 0.014789383258954939,
|
|
"calibration/coverage@10%": 0.057257313784553766,
|
|
"calibration/coverage@15%": 0.1596987720979517,
|
|
"calibration/coverage@20%": 0.4277598387998176,
|
|
"calibration/coverage@25%": 0.8117097398897354,
|
|
"calibration/coverage@30%": 0.9646739130434783,
|
|
"calibration/coverage@5%": 0.014789383258954939,
|
|
"calibration/distribution_entropy_10": 0.6887549737306013,
|
|
"calibration/distribution_entropy_100": 0.4745983011730009,
|
|
"calibration/ece": 0.07153040992310632,
|
|
"calibration/mean_confidence": 0.6690040608217023,
|
|
"calibration/unique_confidence_per_question": 0.06197916666666666,
|
|
"calibration/unique_confidences": 23.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015104166666666651,
|
|
"completions/max_length": 3774.0,
|
|
"completions/max_terminated_length": 3774.0,
|
|
"completions/mean_length": 735.8694580078125,
|
|
"completions/mean_terminated_length": 747.207177734375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 239.4,
|
|
"epoch": 0.1079986500168748,
|
|
"grad_norm": 0.0004037077887915075,
|
|
"learning_rate": 4.909638554216868e-06,
|
|
"loss": -0.0114,
|
|
"num_tokens": 86919345.0,
|
|
"reward": 0.9962880134582519,
|
|
"reward_std": 0.15157434940338135,
|
|
"rewards/accuracy_reward": 0.659375,
|
|
"rewards/brier_reward": 0.7679201841354371,
|
|
"rewards/confidence_uniqueness_reward": 0.677590298652649,
|
|
"rewards/format_reward": 0.9844618201255798,
|
|
"rewards/frontier_aurc_reward": -0.002255662181414664,
|
|
"rewards/frontier_coverage_0": -0.028669605404138564,
|
|
"rewards/frontier_coverage_1": -0.028669605404138564,
|
|
"rewards/frontier_coverage_10": -0.028669605404138564,
|
|
"rewards/frontier_coverage_15": -0.028669605404138564,
|
|
"rewards/frontier_coverage_20": -0.028669605404138564,
|
|
"rewards/frontier_coverage_25": -0.028669605404138564,
|
|
"rewards/frontier_coverage_5": -0.028669605404138564,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.02518573999404907,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1809027761220932,
|
|
"signal/accuracy_reward/group_std_mean": 0.24004943072795867,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.31666667461395265,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0904513880610466,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0904513880610466,
|
|
"signal/advantage_abs_mean": 0.11121969670057297,
|
|
"signal/advantage_pre_scale_abs_mean": 0.11121969670057297,
|
|
"signal/advantage_pre_scale_std": 0.1811255246400833,
|
|
"signal/advantage_std": 0.1811255246400833,
|
|
"signal/brier_reward/centered_abs_mean": 0.11833977550268174,
|
|
"signal/brier_reward/group_std_mean": 0.1528250217437744,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014792471937835217,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014792471937835217,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.15492647886276245,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.18688772320747377,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.019365809857845306,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.019365809857845306,
|
|
"signal/format_reward/centered_abs_mean": 0.02518988773226738,
|
|
"signal/format_reward/group_std_mean": 0.04387797862291336,
|
|
"signal/format_reward/group_zero_std_frac": 0.8305555582046509,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01259494386613369,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.01259494386613369,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0012202380341477693,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.001992561621591449,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.9066219283558895e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.9066219283558895e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.09129920750856399,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.12056645601987839,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.09129920750856399,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.12056645601987839,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.09129920750856399,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.12056645601987839,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09129920750856399,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.12056645601987839,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.09129920750856399,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.12056645601987839,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.09129920750856399,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.12056645601987839,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.09129920750856399,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.12056645601987839,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0014265501173213123,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.021791164949536323,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.03154192678630352,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0027238956186920404,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0027238956186920404,
|
|
"step": 45
|
|
},
|
|
{
|
|
"calibration/aurc": 0.39036076960907307,
|
|
"calibration/batch_distribution_entropy": 0.7862094401826989,
|
|
"calibration/batch_entropy_100bins": 0.6110341298979247,
|
|
"calibration/batch_entropy_10bins": 0.7862094401826989,
|
|
"calibration/batch_entropy_50bins": 0.6934297857057619,
|
|
"calibration/batch_uniqueness": 0.8247381911119701,
|
|
"calibration/buffer_distribution_entropy": 0.6734957884749172,
|
|
"calibration/buffer_entropy_100bins": 0.5282027696652414,
|
|
"calibration/buffer_entropy_10bins": 0.6734957884749172,
|
|
"calibration/buffer_entropy_50bins": 0.6156049219048293,
|
|
"calibration/confidence_entropy": 0.5949417037140304,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.0,
|
|
"calibration/coverage@15%": 0.003655352480417755,
|
|
"calibration/coverage@20%": 0.03593048475555003,
|
|
"calibration/coverage@25%": 0.0391050879301532,
|
|
"calibration/coverage@30%": 0.2087096269843971,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.7862094401826989,
|
|
"calibration/distribution_entropy_100": 0.6110341298979247,
|
|
"calibration/ece": 0.12655827505964012,
|
|
"calibration/mean_confidence": 0.6225528034108609,
|
|
"calibration/unique_confidence_per_question": 0.10677083333333334,
|
|
"calibration/unique_confidences": 41.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.011979166666666652,
|
|
"completions/max_length": 3659.4,
|
|
"completions/max_terminated_length": 3659.4,
|
|
"completions/mean_length": 737.67744140625,
|
|
"completions/mean_terminated_length": 746.5686889648438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 213.2,
|
|
"epoch": 0.11999850001874976,
|
|
"grad_norm": 0.0004018676117993891,
|
|
"learning_rate": 4.759036144578314e-06,
|
|
"loss": -0.0112,
|
|
"num_tokens": 98514989.0,
|
|
"reward": 1.000908660888672,
|
|
"reward_std": 0.14541010558605194,
|
|
"rewards/accuracy_reward": 0.6365451455116272,
|
|
"rewards/brier_reward": 0.7578566431999206,
|
|
"rewards/confidence_uniqueness_reward": 0.8036070704460144,
|
|
"rewards/format_reward": 0.9878472208976745,
|
|
"rewards/frontier_aurc_reward": -0.0020980457309633495,
|
|
"rewards/frontier_coverage_0": -0.028162200190126895,
|
|
"rewards/frontier_coverage_1": -0.028162200190126895,
|
|
"rewards/frontier_coverage_10": -0.028162200190126895,
|
|
"rewards/frontier_coverage_15": -0.028162200190126895,
|
|
"rewards/frontier_coverage_20": -0.028162200190126895,
|
|
"rewards/frontier_coverage_25": -0.028162200190126895,
|
|
"rewards/frontier_coverage_5": -0.028162200190126895,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.026859960705041885,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1797797292470932,
|
|
"signal/accuracy_reward/group_std_mean": 0.2389494448900223,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.3166666805744171,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0898898646235466,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0898898646235466,
|
|
"signal/advantage_abs_mean": 0.105811907351017,
|
|
"signal/advantage_pre_scale_abs_mean": 0.105811907351017,
|
|
"signal/advantage_pre_scale_std": 0.1739170879125595,
|
|
"signal/advantage_std": 0.1739170879125595,
|
|
"signal/brier_reward/centered_abs_mean": 0.12420935779809952,
|
|
"signal/brier_reward/group_std_mean": 0.16102492213249206,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01552616972476244,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01552616972476244,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.10721739381551743,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.13165029883384705,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.013402174226939678,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.013402174226939678,
|
|
"signal/format_reward/centered_abs_mean": 0.02109375,
|
|
"signal/format_reward/group_std_mean": 0.03988752476871014,
|
|
"signal/format_reward/group_zero_std_frac": 0.8361111164093018,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.010546875,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.010546875,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0016509333858266474,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0027218869887292384,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.5795834153541365e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.5795834153541365e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.11604345738887786,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1531293898820877,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.11604345738887786,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1531293898820877,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.11604345738887786,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1531293898820877,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.11604345738887786,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.1531293898820877,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.11604345738887786,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.1531293898820877,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.11604345738887786,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.1531293898820877,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.11604345738887786,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1531293898820877,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0018131790217012166,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.02609681598842144,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.03738295584917069,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00326210199855268,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00326210199855268,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.11999850001874976,
|
|
"eval_completions/clipped_ratio": 0.013888888888888895,
|
|
"eval_completions/max_length": 1763.6666666666667,
|
|
"eval_completions/max_terminated_length": 1763.6666666666667,
|
|
"eval_completions/mean_length": 710.3668518066406,
|
|
"eval_completions/mean_terminated_length": 720.3878885904948,
|
|
"eval_completions/min_length": 65.33333333333333,
|
|
"eval_completions/min_terminated_length": 283.0,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 98514989.0,
|
|
"eval_reward": 1.0152363975842793,
|
|
"eval_reward_std": 0.25018754849831265,
|
|
"eval_rewards/accuracy_reward": 0.671875,
|
|
"eval_rewards/brier_reward": 0.7494580149650574,
|
|
"eval_rewards/confidence_uniqueness_reward": 0.8206921716531118,
|
|
"eval_rewards/format_reward": 0.984375,
|
|
"eval_rewards/frontier_aurc_reward": -0.0020469005879325173,
|
|
"eval_rewards/frontier_coverage_0": -0.05519990002115568,
|
|
"eval_rewards/frontier_coverage_1": -0.05519990002115568,
|
|
"eval_rewards/frontier_coverage_10": -0.05519990002115568,
|
|
"eval_rewards/frontier_coverage_15": -0.05519990002115568,
|
|
"eval_rewards/frontier_coverage_20": -0.05519990002115568,
|
|
"eval_rewards/frontier_coverage_25": -0.05519990002115568,
|
|
"eval_rewards/frontier_coverage_5": -0.05519990002115568,
|
|
"eval_rewards/true_frontier_ece_gap_only_reward": -0.024703877978026867,
|
|
"eval_runtime": 205.888,
|
|
"eval_samples_per_second": 4.857,
|
|
"eval_signal/accuracy_reward/centered_abs_mean": 0.4297960052887599,
|
|
"eval_signal/accuracy_reward/group_std_mean": 0.4702196568250656,
|
|
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21489800264437994,
|
|
"eval_signal/accuracy_reward/weight": 0.5,
|
|
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21489800264437994,
|
|
"eval_signal/advantage_abs_mean": 0.21398618072271347,
|
|
"eval_signal/advantage_pre_scale_abs_mean": 0.21398618072271347,
|
|
"eval_signal/advantage_pre_scale_std": 0.2496974691748619,
|
|
"eval_signal/advantage_std": 0.2496974691748619,
|
|
"eval_signal/brier_reward/centered_abs_mean": 0.14907778551181158,
|
|
"eval_signal/brier_reward/group_std_mean": 0.20007833590110144,
|
|
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.018634723188976448,
|
|
"eval_signal/brier_reward/weight": 0.125,
|
|
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.018634723188976448,
|
|
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.09062439575791359,
|
|
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.12857400501767793,
|
|
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.011328049469739199,
|
|
"eval_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.011328049469739199,
|
|
"eval_signal/format_reward/centered_abs_mean": 0.029947916977107525,
|
|
"eval_signal/format_reward/group_std_mean": 0.0794201207657655,
|
|
"eval_signal/format_reward/group_zero_std_frac": 0.5833333432674408,
|
|
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.014973958488553762,
|
|
"eval_signal/format_reward/weight": 0.5,
|
|
"eval_signal/format_reward/weighted_centered_abs_mean": 0.014973958488553762,
|
|
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.0015480444611360629,
|
|
"eval_signal/frontier_aurc_reward/group_std_mean": 0.003234441547344128,
|
|
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.4188194705250982e-05,
|
|
"eval_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.4188194705250982e-05,
|
|
"eval_signal/frontier_coverage_0/centered_abs_mean": 0.20009989539782205,
|
|
"eval_signal/frontier_coverage_0/group_std_mean": 0.26012368500232697,
|
|
"eval_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_0/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_1/centered_abs_mean": 0.20009989539782205,
|
|
"eval_signal/frontier_coverage_1/group_std_mean": 0.26012368500232697,
|
|
"eval_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_1/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_10/centered_abs_mean": 0.20009989539782205,
|
|
"eval_signal/frontier_coverage_10/group_std_mean": 0.26012368500232697,
|
|
"eval_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_10/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_15/centered_abs_mean": 0.20009989539782205,
|
|
"eval_signal/frontier_coverage_15/group_std_mean": 0.26012368500232697,
|
|
"eval_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_15/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_20/centered_abs_mean": 0.20009989539782205,
|
|
"eval_signal/frontier_coverage_20/group_std_mean": 0.26012368500232697,
|
|
"eval_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_20/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_25/centered_abs_mean": 0.20009989539782205,
|
|
"eval_signal/frontier_coverage_25/group_std_mean": 0.26012368500232697,
|
|
"eval_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_25/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_5/centered_abs_mean": 0.20009989539782205,
|
|
"eval_signal/frontier_coverage_5/group_std_mean": 0.26012368500232697,
|
|
"eval_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/frontier_coverage_5/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0031265608655909696,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.025939644935230415,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.04151128667096297,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.003242455616903802,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.003242455616903802,
|
|
"eval_steps_per_second": 0.029,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.11999850001874976,
|
|
"step": 50,
|
|
"train_probe_completions/clipped_ratio": 0.013715277777777776,
|
|
"train_probe_completions/max_length": 2407.6666666666665,
|
|
"train_probe_completions/max_terminated_length": 2407.6666666666665,
|
|
"train_probe_completions/mean_length": 720.0277811686198,
|
|
"train_probe_completions/mean_terminated_length": 730.0629069010416,
|
|
"train_probe_completions/min_length": 0.0,
|
|
"train_probe_completions/min_terminated_length": 217.16666666666666,
|
|
"train_probe_loss": 0.0,
|
|
"train_probe_num_tokens": 98514989.0,
|
|
"train_probe_reward": 1.0258092880249023,
|
|
"train_probe_reward_std": 0.24107951919237772,
|
|
"train_probe_rewards/accuracy_reward": 0.6814236144224802,
|
|
"train_probe_rewards/brier_reward": 0.7728658020496368,
|
|
"train_probe_rewards/confidence_uniqueness_reward": 0.825143297513326,
|
|
"train_probe_rewards/format_reward": 0.9869791666666666,
|
|
"train_probe_rewards/frontier_aurc_reward": -0.0015906431168938677,
|
|
"train_probe_rewards/frontier_coverage_0": -0.041856971802189946,
|
|
"train_probe_rewards/frontier_coverage_1": -0.041856971802189946,
|
|
"train_probe_rewards/frontier_coverage_10": -0.041856971802189946,
|
|
"train_probe_rewards/frontier_coverage_15": -0.041856971802189946,
|
|
"train_probe_rewards/frontier_coverage_20": -0.041856971802189946,
|
|
"train_probe_rewards/frontier_coverage_25": -0.041856971802189946,
|
|
"train_probe_rewards/frontier_coverage_5": -0.041856971802189946,
|
|
"train_probe_rewards/true_frontier_ece_gap_only_reward": -0.028322534635663033,
|
|
"train_probe_runtime": 203.2167,
|
|
"train_probe_samples_per_second": 4.921,
|
|
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.4200846354166667,
|
|
"train_probe_signal/accuracy_reward/group_std_mean": 0.46454379459222156,
|
|
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21004231770833334,
|
|
"train_probe_signal/accuracy_reward/weight": 0.5,
|
|
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.21004231770833334,
|
|
"train_probe_signal/advantage_abs_mean": 0.20514148473739624,
|
|
"train_probe_signal/advantage_pre_scale_abs_mean": 0.20514148473739624,
|
|
"train_probe_signal/advantage_pre_scale_std": 0.24087134500344595,
|
|
"train_probe_signal/advantage_std": 0.24087134500344595,
|
|
"train_probe_signal/brier_reward/centered_abs_mean": 0.1424630656838417,
|
|
"train_probe_signal/brier_reward/group_std_mean": 0.189873273173968,
|
|
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.017807883210480213,
|
|
"train_probe_signal/brier_reward/weight": 0.125,
|
|
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.017807883210480213,
|
|
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.09026772528886795,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.1251646839082241,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.011283465661108494,
|
|
"train_probe_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.011283465661108494,
|
|
"train_probe_signal/format_reward/centered_abs_mean": 0.025010850746184587,
|
|
"train_probe_signal/format_reward/group_std_mean": 0.06767813768237829,
|
|
"train_probe_signal/format_reward/group_zero_std_frac": 0.6388889054457346,
|
|
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.012505425373092294,
|
|
"train_probe_signal/format_reward/weight": 0.5,
|
|
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.012505425373092294,
|
|
"train_probe_signal/frontier_aurc_reward/centered_abs_mean": 0.0015710045505935948,
|
|
"train_probe_signal/frontier_aurc_reward/group_std_mean": 0.0031676616442079344,
|
|
"train_probe_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.454694610302492e-05,
|
|
"train_probe_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"train_probe_signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.454694610302492e-05,
|
|
"train_probe_signal/frontier_coverage_0/centered_abs_mean": 0.2021650398770968,
|
|
"train_probe_signal/frontier_coverage_0/group_std_mean": 0.266250138481458,
|
|
"train_probe_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_0/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_1/centered_abs_mean": 0.2021650398770968,
|
|
"train_probe_signal/frontier_coverage_1/group_std_mean": 0.266250138481458,
|
|
"train_probe_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_1/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_10/centered_abs_mean": 0.2021650398770968,
|
|
"train_probe_signal/frontier_coverage_10/group_std_mean": 0.266250138481458,
|
|
"train_probe_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_10/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_15/centered_abs_mean": 0.2021650398770968,
|
|
"train_probe_signal/frontier_coverage_15/group_std_mean": 0.266250138481458,
|
|
"train_probe_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_15/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_20/centered_abs_mean": 0.2021650398770968,
|
|
"train_probe_signal/frontier_coverage_20/group_std_mean": 0.266250138481458,
|
|
"train_probe_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_20/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_25/centered_abs_mean": 0.2021650398770968,
|
|
"train_probe_signal/frontier_coverage_25/group_std_mean": 0.266250138481458,
|
|
"train_probe_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_25/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_5/centered_abs_mean": 0.2021650398770968,
|
|
"train_probe_signal/frontier_coverage_5/group_std_mean": 0.266250138481458,
|
|
"train_probe_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/frontier_coverage_5/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0031588287480796375,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.02938245516270399,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.04563416292270025,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.003672806895337999,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.003672806895337999,
|
|
"train_probe_steps_per_second": 0.03
|
|
},
|
|
{
|
|
"calibration/aurc": 0.30551913172089973,
|
|
"calibration/batch_distribution_entropy": 0.8602443760010654,
|
|
"calibration/batch_entropy_100bins": 0.736749609443292,
|
|
"calibration/batch_entropy_10bins": 0.8602443760010654,
|
|
"calibration/batch_entropy_50bins": 0.8128175073562515,
|
|
"calibration/batch_uniqueness": 0.9014400949713448,
|
|
"calibration/buffer_distribution_entropy": 0.7210148263994508,
|
|
"calibration/buffer_entropy_100bins": 0.5682523629363754,
|
|
"calibration/buffer_entropy_10bins": 0.7210148263994508,
|
|
"calibration/buffer_entropy_50bins": 0.6571883430263827,
|
|
"calibration/confidence_entropy": 0.5935431137939096,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.0,
|
|
"calibration/coverage@15%": 0.062436579806646866,
|
|
"calibration/coverage@20%": 0.27323636460432554,
|
|
"calibration/coverage@25%": 0.43223015091863515,
|
|
"calibration/coverage@30%": 0.5437253937007874,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.8602443760010654,
|
|
"calibration/distribution_entropy_100": 0.736749609443292,
|
|
"calibration/ece": 0.1438151896191343,
|
|
"calibration/mean_confidence": 0.5757818524322698,
|
|
"calibration/unique_confidence_per_question": 0.17395833333333335,
|
|
"calibration/unique_confidences": 66.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.014496527777777768,
|
|
"completions/max_length": 3464.4,
|
|
"completions/max_terminated_length": 3464.4,
|
|
"completions/mean_length": 735.163037109375,
|
|
"completions/mean_terminated_length": 746.039599609375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 199.2,
|
|
"epoch": 0.13199835002062474,
|
|
"grad_norm": 0.0003842144215013832,
|
|
"learning_rate": 4.60843373493976e-06,
|
|
"loss": -0.0115,
|
|
"num_tokens": 110064643.0,
|
|
"reward": 1.0145560026168823,
|
|
"reward_std": 0.13707308173179628,
|
|
"rewards/accuracy_reward": 0.649218738079071,
|
|
"rewards/brier_reward": 0.7490468740463256,
|
|
"rewards/confidence_uniqueness_reward": 0.8952165722846985,
|
|
"rewards/format_reward": 0.9855034708976745,
|
|
"rewards/frontier_aurc_reward": -0.001880918419919908,
|
|
"rewards/frontier_coverage_0": -0.042861418426036836,
|
|
"rewards/frontier_coverage_1": -0.042861418426036836,
|
|
"rewards/frontier_coverage_10": -0.042861418426036836,
|
|
"rewards/frontier_coverage_15": -0.042861418426036836,
|
|
"rewards/frontier_coverage_20": -0.042861418426036836,
|
|
"rewards/frontier_coverage_25": -0.042861418426036836,
|
|
"rewards/frontier_coverage_5": -0.042861418426036836,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.028965843096375465,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.17503797709941865,
|
|
"signal/accuracy_reward/group_std_mean": 0.22645011842250823,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.37500000596046446,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08751898854970933,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08751898854970933,
|
|
"signal/advantage_abs_mean": 0.10272245854139328,
|
|
"signal/advantage_pre_scale_abs_mean": 0.10272245854139328,
|
|
"signal/advantage_pre_scale_std": 0.16644595563411713,
|
|
"signal/advantage_std": 0.16644595563411713,
|
|
"signal/brier_reward/centered_abs_mean": 0.1416968137025833,
|
|
"signal/brier_reward/group_std_mean": 0.1823040783405304,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.017712101712822913,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.017712101712822913,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06181478276848793,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.08373434096574783,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0077268478460609915,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0077268478460609915,
|
|
"signal/format_reward/centered_abs_mean": 0.022303602285683156,
|
|
"signal/format_reward/group_std_mean": 0.03928558751940727,
|
|
"signal/format_reward/group_zero_std_frac": 0.8472222208976745,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.011151801142841578,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.011151801142841578,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.001758693833835423,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0033450972754508258,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.7479591153678484e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.7479591153678484e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.16558919548988343,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.21207553446292876,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.16558919548988343,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.21207553446292876,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.16558919548988343,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.21207553446292876,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.16558919548988343,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.21207553446292876,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.16558919548988343,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.21207553446292876,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.16558919548988343,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.21207553446292876,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.16558919548988343,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.21207553446292876,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0025873311795294287,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.026407453045248986,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.038250190764665605,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0033009316306561232,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0033009316306561232,
|
|
"step": 55
|
|
},
|
|
{
|
|
"calibration/aurc": 0.3289672986883583,
|
|
"calibration/batch_distribution_entropy": 0.8569993905562576,
|
|
"calibration/batch_entropy_100bins": 0.7871838425661777,
|
|
"calibration/batch_entropy_10bins": 0.8569993905562576,
|
|
"calibration/batch_entropy_50bins": 0.8397181934509937,
|
|
"calibration/batch_uniqueness": 0.9215534730345482,
|
|
"calibration/buffer_distribution_entropy": 0.7594609879037215,
|
|
"calibration/buffer_entropy_100bins": 0.6073773228204736,
|
|
"calibration/buffer_entropy_10bins": 0.7594609879037215,
|
|
"calibration/buffer_entropy_50bins": 0.6953888937595927,
|
|
"calibration/confidence_entropy": 0.5982193870283902,
|
|
"calibration/coverage@0%": 0.004736842105263158,
|
|
"calibration/coverage@1%": 0.004736842105263158,
|
|
"calibration/coverage@10%": 0.090082667401488,
|
|
"calibration/coverage@15%": 0.20160099200881784,
|
|
"calibration/coverage@20%": 0.3591953706255167,
|
|
"calibration/coverage@25%": 0.4010801873794434,
|
|
"calibration/coverage@30%": 0.47177459355194273,
|
|
"calibration/coverage@5%": 0.004736842105263158,
|
|
"calibration/distribution_entropy_10": 0.8569993905562576,
|
|
"calibration/distribution_entropy_100": 0.7871838425661777,
|
|
"calibration/ece": 0.19379714223757522,
|
|
"calibration/mean_confidence": 0.5902312308233718,
|
|
"calibration/unique_confidence_per_question": 0.2088541666666667,
|
|
"calibration/unique_confidences": 80.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01519097222222221,
|
|
"completions/max_length": 3634.8,
|
|
"completions/max_terminated_length": 3634.8,
|
|
"completions/mean_length": 732.9686767578125,
|
|
"completions/mean_terminated_length": 744.3640380859375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 216.0,
|
|
"epoch": 0.14399820002249972,
|
|
"grad_norm": 0.0004008902469649911,
|
|
"learning_rate": 4.457831325301205e-06,
|
|
"loss": -0.0135,
|
|
"num_tokens": 121605018.0,
|
|
"reward": 1.0072944164276123,
|
|
"reward_std": 0.15234946310520173,
|
|
"rewards/accuracy_reward": 0.6269965291023254,
|
|
"rewards/brier_reward": 0.7545630693435669,
|
|
"rewards/confidence_uniqueness_reward": 0.9078129887580871,
|
|
"rewards/format_reward": 0.9844618082046509,
|
|
"rewards/frontier_aurc_reward": -0.0017762274481356144,
|
|
"rewards/frontier_coverage_0": -0.019636033568531275,
|
|
"rewards/frontier_coverage_1": -0.019636033568531275,
|
|
"rewards/frontier_coverage_10": -0.019636033568531275,
|
|
"rewards/frontier_coverage_15": -0.019636033568531275,
|
|
"rewards/frontier_coverage_20": -0.019636033568531275,
|
|
"rewards/frontier_coverage_25": -0.019636033568531275,
|
|
"rewards/frontier_coverage_5": -0.019636033568531275,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.032450299337506296,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.19198676347732543,
|
|
"signal/accuracy_reward/group_std_mean": 0.24801050424575805,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.3194444477558136,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09599338173866272,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09599338173866272,
|
|
"signal/advantage_abs_mean": 0.11215932667255402,
|
|
"signal/advantage_pre_scale_abs_mean": 0.11215932667255402,
|
|
"signal/advantage_pre_scale_std": 0.17976947426795958,
|
|
"signal/advantage_std": 0.17976947426795958,
|
|
"signal/brier_reward/centered_abs_mean": 0.14585065245628356,
|
|
"signal/brier_reward/group_std_mean": 0.18760787844657897,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.018231331557035445,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.018231331557035445,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06241054162383079,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.09066012054681778,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.007801317702978849,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007801317702978849,
|
|
"signal/format_reward/centered_abs_mean": 0.025927734375,
|
|
"signal/format_reward/group_std_mean": 0.05031422972679138,
|
|
"signal/format_reward/group_zero_std_frac": 0.7861111164093018,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0129638671875,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.0129638671875,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.002507622819393873,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.004252730589359999,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.918160655302927e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.918160655302927e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.15634405016899108,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.20499549806118011,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.15634405016899108,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.20499549806118011,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.15634405016899108,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.20499549806118011,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.15634405016899108,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.20499549806118011,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.15634405016899108,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.20499549806118011,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.15634405016899108,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.20499549806118011,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.15634405016899108,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.20499549806118011,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0024428757838904857,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.029747573658823967,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.04069453105330467,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.003718446707352996,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.003718446707352996,
|
|
"step": 60
|
|
},
|
|
{
|
|
"calibration/aurc": 0.27763096415418204,
|
|
"calibration/batch_distribution_entropy": 0.8015767599827139,
|
|
"calibration/batch_entropy_100bins": 0.7749875634194134,
|
|
"calibration/batch_entropy_10bins": 0.8015767599827139,
|
|
"calibration/batch_entropy_50bins": 0.8129389718529344,
|
|
"calibration/batch_uniqueness": 0.9117152414913241,
|
|
"calibration/buffer_distribution_entropy": 0.7814829282050988,
|
|
"calibration/buffer_entropy_100bins": 0.6385089459762243,
|
|
"calibration/buffer_entropy_10bins": 0.7814829282050988,
|
|
"calibration/buffer_entropy_50bins": 0.7230478450681841,
|
|
"calibration/confidence_entropy": 0.5883703123093914,
|
|
"calibration/coverage@0%": 0.0,
|
|
"calibration/coverage@1%": 0.0,
|
|
"calibration/coverage@10%": 0.005774278215223097,
|
|
"calibration/coverage@15%": 0.2747384205246596,
|
|
"calibration/coverage@20%": 0.5393372634703384,
|
|
"calibration/coverage@25%": 0.6022603839441535,
|
|
"calibration/coverage@30%": 0.7034666666666667,
|
|
"calibration/coverage@5%": 0.0,
|
|
"calibration/distribution_entropy_10": 0.8015767599827139,
|
|
"calibration/distribution_entropy_100": 0.7749875634194134,
|
|
"calibration/ece": 0.13966504962011914,
|
|
"calibration/mean_confidence": 0.639796467979542,
|
|
"calibration/unique_confidence_per_question": 0.18958333333333333,
|
|
"calibration/unique_confidences": 72.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.013368055555555581,
|
|
"completions/max_length": 3414.8,
|
|
"completions/max_terminated_length": 3414.8,
|
|
"completions/mean_length": 708.5317016601563,
|
|
"completions/mean_terminated_length": 718.0807495117188,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 209.4,
|
|
"epoch": 0.1559980500243747,
|
|
"grad_norm": 0.0004595128120854497,
|
|
"learning_rate": 4.307228915662651e-06,
|
|
"loss": -0.0113,
|
|
"num_tokens": 132861351.0,
|
|
"reward": 1.0280081152915954,
|
|
"reward_std": 0.13622777462005614,
|
|
"rewards/accuracy_reward": 0.6611979246139527,
|
|
"rewards/brier_reward": 0.7732649683952332,
|
|
"rewards/confidence_uniqueness_reward": 0.9014915823936462,
|
|
"rewards/format_reward": 0.9864583373069763,
|
|
"rewards/frontier_aurc_reward": -0.0018865561811253428,
|
|
"rewards/frontier_coverage_0": -0.024941197596490383,
|
|
"rewards/frontier_coverage_1": -0.024941197596490383,
|
|
"rewards/frontier_coverage_10": -0.024941197596490383,
|
|
"rewards/frontier_coverage_15": -0.024941197596490383,
|
|
"rewards/frontier_coverage_20": -0.024941197596490383,
|
|
"rewards/frontier_coverage_25": -0.024941197596490383,
|
|
"rewards/frontier_coverage_5": -0.024941197596490383,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.019257388636469842,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16076931357383728,
|
|
"signal/accuracy_reward/group_std_mean": 0.21609613299369812,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.3694444477558136,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08038465678691864,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08038465678691864,
|
|
"signal/advantage_abs_mean": 0.09798353910446167,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09798353910446167,
|
|
"signal/advantage_pre_scale_std": 0.1702731281518936,
|
|
"signal/advantage_std": 0.1702731281518936,
|
|
"signal/brier_reward/centered_abs_mean": 0.11966974288225174,
|
|
"signal/brier_reward/group_std_mean": 0.1555977314710617,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014958717860281467,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014958717860281467,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07107813656330109,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.09667231291532516,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008884767070412636,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008884767070412636,
|
|
"signal/format_reward/centered_abs_mean": 0.02330729179084301,
|
|
"signal/format_reward/group_std_mean": 0.043983825296163556,
|
|
"signal/format_reward/group_zero_std_frac": 0.8222222328186035,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.011653645895421505,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.011653645895421505,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0014523085206747054,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0024229245027527213,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.2692320635542272e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.2692320635542272e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.10854218900203705,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.14894305765628815,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.10854218900203705,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.14894305765628815,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.10854218900203705,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.14894305765628815,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.10854218900203705,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.14894305765628815,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.10854218900203705,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.14894305765628815,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.10854218900203705,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.14894305765628815,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.10854218900203705,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.14894305765628815,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0016959717031568289,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0190825667232275,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.02821722887456417,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0023853208404034376,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0023853208404034376,
|
|
"step": 65
|
|
},
|
|
{
|
|
"calibration/aurc": 0.2942477916112748,
|
|
"calibration/batch_distribution_entropy": 0.7283107979539343,
|
|
"calibration/batch_entropy_100bins": 0.7784214972740136,
|
|
"calibration/batch_entropy_10bins": 0.7283107979539343,
|
|
"calibration/batch_entropy_50bins": 0.7914896701898086,
|
|
"calibration/batch_uniqueness": 0.9236069702763444,
|
|
"calibration/buffer_distribution_entropy": 0.794101813716716,
|
|
"calibration/buffer_entropy_100bins": 0.6672103969274696,
|
|
"calibration/buffer_entropy_10bins": 0.794101813716716,
|
|
"calibration/buffer_entropy_50bins": 0.746071032044594,
|
|
"calibration/confidence_entropy": 0.5846905333336511,
|
|
"calibration/coverage@0%": 0.0199668754084534,
|
|
"calibration/coverage@1%": 0.0199668754084534,
|
|
"calibration/coverage@10%": 0.05303774155018569,
|
|
"calibration/coverage@15%": 0.07666255160682335,
|
|
"calibration/coverage@20%": 0.23386797400915085,
|
|
"calibration/coverage@25%": 0.3064000917049651,
|
|
"calibration/coverage@30%": 0.49787412509417434,
|
|
"calibration/coverage@5%": 0.0330902349885059,
|
|
"calibration/distribution_entropy_10": 0.7283107979539343,
|
|
"calibration/distribution_entropy_100": 0.7784214972740136,
|
|
"calibration/ece": 0.091677989362732,
|
|
"calibration/mean_confidence": 0.6715764005231176,
|
|
"calibration/unique_confidence_per_question": 0.171875,
|
|
"calibration/unique_confidences": 66.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.010156250000000023,
|
|
"completions/max_length": 3671.2,
|
|
"completions/max_terminated_length": 3671.2,
|
|
"completions/mean_length": 689.2970703125,
|
|
"completions/mean_terminated_length": 696.3696044921875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 175.0,
|
|
"epoch": 0.16799790002624967,
|
|
"grad_norm": 0.00040550867561250925,
|
|
"learning_rate": 4.156626506024097e-06,
|
|
"loss": -0.0087,
|
|
"num_tokens": 143880197.0,
|
|
"reward": 1.0235817909240723,
|
|
"reward_std": 0.13608680069446563,
|
|
"rewards/accuracy_reward": 0.6411458253860474,
|
|
"rewards/brier_reward": 0.771563458442688,
|
|
"rewards/confidence_uniqueness_reward": 0.9243876576423645,
|
|
"rewards/format_reward": 0.9894965291023254,
|
|
"rewards/frontier_aurc_reward": -0.0020083141047507525,
|
|
"rewards/frontier_coverage_0": -0.014133398490957915,
|
|
"rewards/frontier_coverage_1": -0.014133398490957915,
|
|
"rewards/frontier_coverage_10": -0.014133398490957915,
|
|
"rewards/frontier_coverage_15": -0.014133398490957915,
|
|
"rewards/frontier_coverage_20": -0.014133398490957915,
|
|
"rewards/frontier_coverage_25": -0.014133398490957915,
|
|
"rewards/frontier_coverage_5": -0.014133398490957915,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.017248846217989923,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1690972238779068,
|
|
"signal/accuracy_reward/group_std_mean": 0.2230025738477707,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.3694444477558136,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0845486119389534,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0845486119389534,
|
|
"signal/advantage_abs_mean": 0.09818485230207444,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09818485230207444,
|
|
"signal/advantage_pre_scale_std": 0.16889992356300354,
|
|
"signal/advantage_std": 0.16889992356300354,
|
|
"signal/brier_reward/centered_abs_mean": 0.11540952920913697,
|
|
"signal/brier_reward/group_std_mean": 0.1515179991722107,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014426191151142121,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014426191151142121,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.047419081628322604,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.07018115222454072,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0059273852035403255,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0059273852035403255,
|
|
"signal/format_reward/centered_abs_mean": 0.01873372420668602,
|
|
"signal/format_reward/group_std_mean": 0.03724060095846653,
|
|
"signal/format_reward/group_zero_std_frac": 0.8416666626930237,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00936686210334301,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.00936686210334301,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0013382966397330164,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0020241386722773314,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.0910884995828382e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.0910884995828382e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.10258018672466278,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.14305810928344725,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.10258018672466278,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.14305810928344725,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.10258018672466278,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.14305810928344725,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.10258018672466278,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.14305810928344725,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.10258018672466278,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.14305810928344725,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.10258018672466278,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.14305810928344725,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.10258018672466278,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.14305810928344725,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001602815417572856,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.016778473183512686,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.023937665671110154,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.002097309147939086,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.002097309147939086,
|
|
"step": 70
|
|
},
|
|
{
|
|
"calibration/aurc": 0.26809441942504275,
|
|
"calibration/batch_distribution_entropy": 0.6838393931393899,
|
|
"calibration/batch_entropy_100bins": 0.7729783643288141,
|
|
"calibration/batch_entropy_10bins": 0.6838393931393899,
|
|
"calibration/batch_entropy_50bins": 0.7797124119634354,
|
|
"calibration/batch_uniqueness": 0.9257592753961467,
|
|
"calibration/buffer_distribution_entropy": 0.8014343512506427,
|
|
"calibration/buffer_entropy_100bins": 0.6965847069678281,
|
|
"calibration/buffer_entropy_10bins": 0.8014343512506427,
|
|
"calibration/buffer_entropy_50bins": 0.7671058988973287,
|
|
"calibration/confidence_entropy": 0.5651903684955479,
|
|
"calibration/coverage@0%": 0.007869979733363341,
|
|
"calibration/coverage@1%": 0.007869979733363341,
|
|
"calibration/coverage@10%": 0.007869979733363341,
|
|
"calibration/coverage@15%": 0.22780781645114204,
|
|
"calibration/coverage@20%": 0.257160283163038,
|
|
"calibration/coverage@25%": 0.46021289135147203,
|
|
"calibration/coverage@30%": 0.6366343893697362,
|
|
"calibration/coverage@5%": 0.007869979733363341,
|
|
"calibration/distribution_entropy_10": 0.6838393931393899,
|
|
"calibration/distribution_entropy_100": 0.7729783643288141,
|
|
"calibration/ece": 0.11109141408517027,
|
|
"calibration/mean_confidence": 0.7044527597089597,
|
|
"calibration/unique_confidence_per_question": 0.16354166666666664,
|
|
"calibration/unique_confidences": 62.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00703125,
|
|
"completions/max_length": 3003.8,
|
|
"completions/max_terminated_length": 3003.8,
|
|
"completions/mean_length": 662.574658203125,
|
|
"completions/mean_terminated_length": 667.2730590820313,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 191.8,
|
|
"epoch": 0.17999775002812465,
|
|
"grad_norm": 0.000408834166591987,
|
|
"learning_rate": 4.006024096385543e-06,
|
|
"loss": -0.0048,
|
|
"num_tokens": 154577953.0,
|
|
"reward": 1.052880334854126,
|
|
"reward_std": 0.13121603578329086,
|
|
"rewards/accuracy_reward": 0.6934027791023254,
|
|
"rewards/brier_reward": 0.7911527037620545,
|
|
"rewards/confidence_uniqueness_reward": 0.9237256646156311,
|
|
"rewards/format_reward": 0.9928819537162781,
|
|
"rewards/frontier_aurc_reward": -0.001983778248541057,
|
|
"rewards/frontier_coverage_0": -0.02244817279279232,
|
|
"rewards/frontier_coverage_1": -0.02244817279279232,
|
|
"rewards/frontier_coverage_10": -0.02244817279279232,
|
|
"rewards/frontier_coverage_15": -0.02244817279279232,
|
|
"rewards/frontier_coverage_20": -0.02244817279279232,
|
|
"rewards/frontier_coverage_25": -0.02244817279279232,
|
|
"rewards/frontier_coverage_5": -0.02244817279279232,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.01708451323211193,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.17001952826976777,
|
|
"signal/accuracy_reward/group_std_mean": 0.22074966430664061,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.38333333730697633,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08500976413488388,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08500976413488388,
|
|
"signal/advantage_abs_mean": 0.09748768210411071,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09748768210411071,
|
|
"signal/advantage_pre_scale_std": 0.16811644434928893,
|
|
"signal/advantage_std": 0.16811644434928893,
|
|
"signal/brier_reward/centered_abs_mean": 0.110137939453125,
|
|
"signal/brier_reward/group_std_mean": 0.14241448044776917,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013767242431640625,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013767242431640625,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.0404249906539917,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05972475409507751,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005053123831748963,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005053123831748963,
|
|
"signal/format_reward/centered_abs_mean": 0.012988281436264515,
|
|
"signal/format_reward/group_std_mean": 0.02832689881324768,
|
|
"signal/format_reward/group_zero_std_frac": 0.8694444537162781,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0064941407181322575,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.0064941407181322575,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0014353625476360321,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.002100939303636551,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.2427539806813002e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.2427539806813002e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.07951787561178207,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1129288211464882,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.07951787561178207,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1129288211464882,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.07951787561178207,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1129288211464882,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.07951787561178207,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.1129288211464882,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.07951787561178207,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.1129288211464882,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.07951787561178207,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.1129288211464882,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.07951787561178207,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1129288211464882,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0012424668064340949,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.016805017180740834,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.023270204663276672,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0021006271475926042,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0021006271475926042,
|
|
"step": 75
|
|
},
|
|
{
|
|
"calibration/aurc": 0.21834242019243638,
|
|
"calibration/batch_distribution_entropy": 0.6654728425240195,
|
|
"calibration/batch_entropy_100bins": 0.7709733639402805,
|
|
"calibration/batch_entropy_10bins": 0.6654728425240195,
|
|
"calibration/batch_entropy_50bins": 0.7749415689063472,
|
|
"calibration/batch_uniqueness": 0.9253690248872815,
|
|
"calibration/buffer_distribution_entropy": 0.8056100350450295,
|
|
"calibration/buffer_entropy_100bins": 0.7215675673965605,
|
|
"calibration/buffer_entropy_10bins": 0.8056100350450295,
|
|
"calibration/buffer_entropy_50bins": 0.7838614791193015,
|
|
"calibration/confidence_entropy": 0.545574600148968,
|
|
"calibration/coverage@0%": 0.02220438511574777,
|
|
"calibration/coverage@1%": 0.02220438511574777,
|
|
"calibration/coverage@10%": 0.23966102127612107,
|
|
"calibration/coverage@15%": 0.34606994679725095,
|
|
"calibration/coverage@20%": 0.5504157493778901,
|
|
"calibration/coverage@25%": 0.6507980412356587,
|
|
"calibration/coverage@30%": 0.7648198585964071,
|
|
"calibration/coverage@5%": 0.07932105432846319,
|
|
"calibration/distribution_entropy_10": 0.6654728425240195,
|
|
"calibration/distribution_entropy_100": 0.7709733639402805,
|
|
"calibration/ece": 0.12195687052074375,
|
|
"calibration/mean_confidence": 0.7164405583247829,
|
|
"calibration/unique_confidence_per_question": 0.1703125,
|
|
"calibration/unique_confidences": 65.4,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01137152777777779,
|
|
"completions/max_length": 3956.8,
|
|
"completions/max_terminated_length": 3956.8,
|
|
"completions/mean_length": 682.140283203125,
|
|
"completions/mean_terminated_length": 690.0593139648438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 193.0,
|
|
"epoch": 0.19199760002999963,
|
|
"grad_norm": 0.0004445287340786308,
|
|
"learning_rate": 3.855421686746989e-06,
|
|
"loss": -0.0085,
|
|
"num_tokens": 165489489.0,
|
|
"reward": 1.0320314049720765,
|
|
"reward_std": 0.1364010527729988,
|
|
"rewards/accuracy_reward": 0.6573784589767456,
|
|
"rewards/brier_reward": 0.7777002453804016,
|
|
"rewards/confidence_uniqueness_reward": 0.919914448261261,
|
|
"rewards/format_reward": 0.9884548544883728,
|
|
"rewards/frontier_aurc_reward": -0.002187199471518397,
|
|
"rewards/frontier_coverage_0": -0.009425394237041473,
|
|
"rewards/frontier_coverage_1": -0.009425394237041473,
|
|
"rewards/frontier_coverage_10": -0.009425394237041473,
|
|
"rewards/frontier_coverage_15": -0.009425394237041473,
|
|
"rewards/frontier_coverage_20": -0.009425394237041473,
|
|
"rewards/frontier_coverage_25": -0.009425394237041473,
|
|
"rewards/frontier_coverage_5": -0.009425394237041473,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.016176528483629226,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16724717915058135,
|
|
"signal/accuracy_reward/group_std_mean": 0.22416210770606995,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.35,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08362358957529067,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08362358957529067,
|
|
"signal/advantage_abs_mean": 0.09906900972127915,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09906900972127915,
|
|
"signal/advantage_pre_scale_std": 0.16992701590061188,
|
|
"signal/advantage_std": 0.16992701590061188,
|
|
"signal/brier_reward/centered_abs_mean": 0.11396953910589218,
|
|
"signal/brier_reward/group_std_mean": 0.14948717951774598,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014246192388236522,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014246192388236522,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.041234496235847476,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06095789596438408,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0051543120294809345,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0051543120294809345,
|
|
"signal/format_reward/centered_abs_mean": 0.01700846329331398,
|
|
"signal/format_reward/group_std_mean": 0.03214373588562012,
|
|
"signal/format_reward/group_zero_std_frac": 0.8666666746139526,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00850423164665699,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.00850423164665699,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0015931333182379603,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0023470679763704537,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.489270809746813e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.489270809746813e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.07731934040784835,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.10898190438747406,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.07731934040784835,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.10898190438747406,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.07731934040784835,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.10898190438747406,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.07731934040784835,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.10898190438747406,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.07731934040784835,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.10898190438747406,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.07731934040784835,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.10898190438747406,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.07731934040784835,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.10898190438747406,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0012081146938726305,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.015519179962575435,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.021731919422745705,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0019398974953219294,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0019398974953219294,
|
|
"step": 80
|
|
},
|
|
{
|
|
"calibration/aurc": 0.223200967338523,
|
|
"calibration/batch_distribution_entropy": 0.7470308632593083,
|
|
"calibration/batch_entropy_100bins": 0.8097957063944881,
|
|
"calibration/batch_entropy_10bins": 0.7470308632593083,
|
|
"calibration/batch_entropy_50bins": 0.8235600550504187,
|
|
"calibration/batch_uniqueness": 0.9389559027110078,
|
|
"calibration/buffer_distribution_entropy": 0.810030947507659,
|
|
"calibration/buffer_entropy_100bins": 0.7424741081330337,
|
|
"calibration/buffer_entropy_10bins": 0.810030947507659,
|
|
"calibration/buffer_entropy_50bins": 0.7981797753927561,
|
|
"calibration/confidence_entropy": 0.5852915959437024,
|
|
"calibration/coverage@0%": 0.009550042580097244,
|
|
"calibration/coverage@1%": 0.009550042580097244,
|
|
"calibration/coverage@10%": 0.13673516904381933,
|
|
"calibration/coverage@15%": 0.25685554970229674,
|
|
"calibration/coverage@20%": 0.41418009626638747,
|
|
"calibration/coverage@25%": 0.6514727170160993,
|
|
"calibration/coverage@30%": 0.8271656731884673,
|
|
"calibration/coverage@5%": 0.04056608536084591,
|
|
"calibration/distribution_entropy_10": 0.7470308632593083,
|
|
"calibration/distribution_entropy_100": 0.8097957063944881,
|
|
"calibration/ece": 0.09254755119264485,
|
|
"calibration/mean_confidence": 0.6642736722712816,
|
|
"calibration/unique_confidence_per_question": 0.1880208333333333,
|
|
"calibration/unique_confidences": 72.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.008159722222222231,
|
|
"completions/max_length": 3769.2,
|
|
"completions/max_terminated_length": 3769.2,
|
|
"completions/mean_length": 685.8019897460938,
|
|
"completions/mean_terminated_length": 691.5063598632812,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 200.0,
|
|
"epoch": 0.2039974500318746,
|
|
"grad_norm": 0.00041797629091888666,
|
|
"learning_rate": 3.7048192771084342e-06,
|
|
"loss": -0.0068,
|
|
"num_tokens": 176477128.0,
|
|
"reward": 1.0475164890289306,
|
|
"reward_std": 0.1313488855957985,
|
|
"rewards/accuracy_reward": 0.6796006917953491,
|
|
"rewards/brier_reward": 0.7933130860328674,
|
|
"rewards/confidence_uniqueness_reward": 0.9283158540725708,
|
|
"rewards/format_reward": 0.9916666746139526,
|
|
"rewards/frontier_aurc_reward": -0.0017628843430429696,
|
|
"rewards/frontier_coverage_0": -0.01574636101722717,
|
|
"rewards/frontier_coverage_1": -0.01574636101722717,
|
|
"rewards/frontier_coverage_10": -0.01574636101722717,
|
|
"rewards/frontier_coverage_15": -0.01574636101722717,
|
|
"rewards/frontier_coverage_20": -0.01574636101722717,
|
|
"rewards/frontier_coverage_25": -0.01574636101722717,
|
|
"rewards/frontier_coverage_5": -0.01574636101722717,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.01256832219660282,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1685926616191864,
|
|
"signal/accuracy_reward/group_std_mean": 0.22146643698215485,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.36666667461395264,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0842963308095932,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0842963308095932,
|
|
"signal/advantage_abs_mean": 0.096477310359478,
|
|
"signal/advantage_pre_scale_abs_mean": 0.096477310359478,
|
|
"signal/advantage_pre_scale_std": 0.16670409142971038,
|
|
"signal/advantage_std": 0.16670409142971038,
|
|
"signal/brier_reward/centered_abs_mean": 0.10519883632659913,
|
|
"signal/brier_reward/group_std_mean": 0.13862771689891815,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01314985454082489,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01314985454082489,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.037385327741503716,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.058187781274318694,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0046731659676879644,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0046731659676879644,
|
|
"signal/format_reward/centered_abs_mean": 0.014822048507630825,
|
|
"signal/format_reward/group_std_mean": 0.03110768012702465,
|
|
"signal/format_reward/group_zero_std_frac": 0.8583333253860473,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007411024253815413,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.007411024253815413,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0013182483380660415,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0019752333406358956,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.0597630282281898e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.0597630282281898e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.08810206353664399,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1211852788925171,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.08810206353664399,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1211852788925171,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.08810206353664399,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1211852788925171,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.08810206353664399,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.1211852788925171,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.08810206353664399,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.1211852788925171,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.08810206353664399,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.1211852788925171,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.08810206353664399,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1211852788925171,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0013765947427600623,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.01288688350468874,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.019262754917144777,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0016108604380860926,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0016108604380860926,
|
|
"step": 85
|
|
},
|
|
{
|
|
"calibration/aurc": 0.1750125348445183,
|
|
"calibration/batch_distribution_entropy": 0.7160344664199889,
|
|
"calibration/batch_entropy_100bins": 0.791538051421172,
|
|
"calibration/batch_entropy_10bins": 0.7160344664199889,
|
|
"calibration/batch_entropy_50bins": 0.803385649917867,
|
|
"calibration/batch_uniqueness": 0.9360382132701994,
|
|
"calibration/buffer_distribution_entropy": 0.8130252160812972,
|
|
"calibration/buffer_entropy_100bins": 0.7592675172132644,
|
|
"calibration/buffer_entropy_10bins": 0.8130252160812972,
|
|
"calibration/buffer_entropy_50bins": 0.8095419172681041,
|
|
"calibration/confidence_entropy": 0.5752702254035462,
|
|
"calibration/coverage@0%": 0.021076727388243136,
|
|
"calibration/coverage@1%": 0.021076727388243136,
|
|
"calibration/coverage@10%": 0.28257062422924945,
|
|
"calibration/coverage@15%": 0.46590979360010365,
|
|
"calibration/coverage@20%": 0.6646229888828084,
|
|
"calibration/coverage@25%": 0.8164021164021165,
|
|
"calibration/coverage@30%": 0.8899470899470898,
|
|
"calibration/coverage@5%": 0.05624733106278381,
|
|
"calibration/distribution_entropy_10": 0.7160344664199889,
|
|
"calibration/distribution_entropy_100": 0.791538051421172,
|
|
"calibration/ece": 0.07857751360789,
|
|
"calibration/mean_confidence": 0.6838434656307941,
|
|
"calibration/unique_confidence_per_question": 0.16197916666666667,
|
|
"calibration/unique_confidences": 62.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.011892361111111116,
|
|
"completions/max_length": 2865.4,
|
|
"completions/max_terminated_length": 2865.4,
|
|
"completions/mean_length": 644.41875,
|
|
"completions/mean_terminated_length": 652.1775756835938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 177.6,
|
|
"epoch": 0.2159973000337496,
|
|
"grad_norm": 0.0004885443486273289,
|
|
"learning_rate": 3.5542168674698798e-06,
|
|
"loss": -0.01,
|
|
"num_tokens": 186969504.0,
|
|
"reward": 1.0422258853912354,
|
|
"reward_std": 0.1346314489841461,
|
|
"rewards/accuracy_reward": 0.6743055701255798,
|
|
"rewards/brier_reward": 0.7906572461128235,
|
|
"rewards/confidence_uniqueness_reward": 0.9217118740081787,
|
|
"rewards/format_reward": 0.9880208373069763,
|
|
"rewards/frontier_aurc_reward": -0.001758960704319179,
|
|
"rewards/frontier_coverage_0": -0.01478583961725235,
|
|
"rewards/frontier_coverage_1": -0.01478583961725235,
|
|
"rewards/frontier_coverage_10": -0.01478583961725235,
|
|
"rewards/frontier_coverage_15": -0.01478583961725235,
|
|
"rewards/frontier_coverage_20": -0.01478583961725235,
|
|
"rewards/frontier_coverage_25": -0.01478583961725235,
|
|
"rewards/frontier_coverage_5": -0.01478583961725235,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.01070992909371853,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16719835251569748,
|
|
"signal/accuracy_reward/group_std_mean": 0.2174463987350464,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.38888888955116274,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08359917625784874,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08359917625784874,
|
|
"signal/advantage_abs_mean": 0.0994023248553276,
|
|
"signal/advantage_pre_scale_abs_mean": 0.0994023248553276,
|
|
"signal/advantage_pre_scale_std": 0.1722485601902008,
|
|
"signal/advantage_std": 0.1722485601902008,
|
|
"signal/brier_reward/centered_abs_mean": 0.10387470126152039,
|
|
"signal/brier_reward/group_std_mean": 0.13654259741306304,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.012984337657690049,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.012984337657690049,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.043320811539888385,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06586214751005173,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005415101442486048,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005415101442486048,
|
|
"signal/format_reward/centered_abs_mean": 0.02032335065305233,
|
|
"signal/format_reward/group_std_mean": 0.03864026740193367,
|
|
"signal/format_reward/group_zero_std_frac": 0.8388888955116272,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.010161675326526166,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.010161675326526166,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0012993402313441038,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0018939806381240488,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.030219111475162e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.030219111475162e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.07952361851930619,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.10869008004665374,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.07952361851930619,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.10869008004665374,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.07952361851930619,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.10869008004665374,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.07952361851930619,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.10869008004665374,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.07952361851930619,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.10869008004665374,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.07952361851930619,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.10869008004665374,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.07952361851930619,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.10869008004665374,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0012425565393641592,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.010778117179870605,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.01537728812545538,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0013472646474838256,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0013472646474838256,
|
|
"step": 90
|
|
},
|
|
{
|
|
"calibration/aurc": 0.20718600424081587,
|
|
"calibration/batch_distribution_entropy": 0.7145419993415512,
|
|
"calibration/batch_entropy_100bins": 0.7940045869247137,
|
|
"calibration/batch_entropy_10bins": 0.7145419993415512,
|
|
"calibration/batch_entropy_50bins": 0.8040417294748641,
|
|
"calibration/batch_uniqueness": 0.9367676324025431,
|
|
"calibration/buffer_distribution_entropy": 0.8145125153648216,
|
|
"calibration/buffer_entropy_100bins": 0.7725883097693098,
|
|
"calibration/buffer_entropy_10bins": 0.8145125153648216,
|
|
"calibration/buffer_entropy_50bins": 0.817964486415557,
|
|
"calibration/confidence_entropy": 0.5770644535474301,
|
|
"calibration/coverage@0%": 0.018359180375690363,
|
|
"calibration/coverage@1%": 0.018359180375690363,
|
|
"calibration/coverage@10%": 0.3142551794049478,
|
|
"calibration/coverage@15%": 0.47292650936860287,
|
|
"calibration/coverage@20%": 0.5374353404763397,
|
|
"calibration/coverage@25%": 0.6198096590711307,
|
|
"calibration/coverage@30%": 0.70205074477485,
|
|
"calibration/coverage@5%": 0.15136133782852185,
|
|
"calibration/distribution_entropy_10": 0.7145419993415512,
|
|
"calibration/distribution_entropy_100": 0.7940045869247137,
|
|
"calibration/ece": 0.12296266760467423,
|
|
"calibration/mean_confidence": 0.6829029134911659,
|
|
"calibration/unique_confidence_per_question": 0.15885416666666669,
|
|
"calibration/unique_confidences": 61.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00824652777777779,
|
|
"completions/max_length": 3075.4,
|
|
"completions/max_terminated_length": 3075.4,
|
|
"completions/mean_length": 649.2624145507813,
|
|
"completions/mean_terminated_length": 654.6318359375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 189.0,
|
|
"epoch": 0.22799715003562457,
|
|
"grad_norm": 0.00040514481952413917,
|
|
"learning_rate": 3.4036144578313257e-06,
|
|
"loss": -0.0068,
|
|
"num_tokens": 197540687.0,
|
|
"reward": 1.0414008617401123,
|
|
"reward_std": 0.12309487164020538,
|
|
"rewards/accuracy_reward": 0.6670138955116272,
|
|
"rewards/brier_reward": 0.7899128198623657,
|
|
"rewards/confidence_uniqueness_reward": 0.926601231098175,
|
|
"rewards/format_reward": 0.9917534589767456,
|
|
"rewards/frontier_aurc_reward": -0.0018277077469974756,
|
|
"rewards/frontier_coverage_0": -0.012428297474980355,
|
|
"rewards/frontier_coverage_1": -0.012428297474980355,
|
|
"rewards/frontier_coverage_10": -0.012428297474980355,
|
|
"rewards/frontier_coverage_15": -0.012428297474980355,
|
|
"rewards/frontier_coverage_20": -0.012428297474980355,
|
|
"rewards/frontier_coverage_25": -0.012428297474980355,
|
|
"rewards/frontier_coverage_5": -0.012428297474980355,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.009273872710764408,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.15275607407093048,
|
|
"signal/accuracy_reward/group_std_mean": 0.2017911434173584,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4277777850627899,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07637803703546524,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07637803703546524,
|
|
"signal/advantage_abs_mean": 0.08974921703338623,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08974921703338623,
|
|
"signal/advantage_pre_scale_std": 0.16027459800243377,
|
|
"signal/advantage_std": 0.16027459800243377,
|
|
"signal/brier_reward/centered_abs_mean": 0.09923952370882035,
|
|
"signal/brier_reward/group_std_mean": 0.13173434436321257,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.012404940463602543,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.012404940463602543,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03722200207412243,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05656850188970566,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0046527502592653034,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0046527502592653034,
|
|
"signal/format_reward/centered_abs_mean": 0.014653862826526165,
|
|
"signal/format_reward/group_std_mean": 0.0300372663885355,
|
|
"signal/format_reward/group_zero_std_frac": 0.8694444537162781,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007326931413263083,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.007326931413263083,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.001311829499900341,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0018943335162475705,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.0497335935942828e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.0497335935942828e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.07780194133520127,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.10721372663974763,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.07780194133520127,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.10721372663974763,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.07780194133520127,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.10721372663974763,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.07780194133520127,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.10721372663974763,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.07780194133520127,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.10721372663974763,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.07780194133520127,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.10721372663974763,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.07780194133520127,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.10721372663974763,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0012156553333625198,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.009020444191992282,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.012560129538178444,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0011275555239990353,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0011275555239990353,
|
|
"step": 95
|
|
},
|
|
{
|
|
"calibration/aurc": 0.18912161303723035,
|
|
"calibration/batch_distribution_entropy": 0.7509673473140573,
|
|
"calibration/batch_entropy_100bins": 0.8107610890298,
|
|
"calibration/batch_entropy_10bins": 0.7509673473140573,
|
|
"calibration/batch_entropy_50bins": 0.8259464172988149,
|
|
"calibration/batch_uniqueness": 0.9409531377801335,
|
|
"calibration/buffer_distribution_entropy": 0.8153438800401813,
|
|
"calibration/buffer_entropy_100bins": 0.7838405798633927,
|
|
"calibration/buffer_entropy_10bins": 0.8153438800401813,
|
|
"calibration/buffer_entropy_50bins": 0.8251483429084174,
|
|
"calibration/confidence_entropy": 0.5673696735876815,
|
|
"calibration/coverage@0%": 0.005273188206117388,
|
|
"calibration/coverage@1%": 0.005273188206117388,
|
|
"calibration/coverage@10%": 0.14509694130614492,
|
|
"calibration/coverage@15%": 0.41793198434036133,
|
|
"calibration/coverage@20%": 0.6852799572454381,
|
|
"calibration/coverage@25%": 0.8459034433939087,
|
|
"calibration/coverage@30%": 0.9333465402575349,
|
|
"calibration/coverage@5%": 0.005273188206117388,
|
|
"calibration/distribution_entropy_10": 0.7509673473140573,
|
|
"calibration/distribution_entropy_100": 0.8107610890298,
|
|
"calibration/ece": 0.08220890696497,
|
|
"calibration/mean_confidence": 0.6813237522023581,
|
|
"calibration/unique_confidence_per_question": 0.17552083333333335,
|
|
"calibration/unique_confidences": 67.4,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.012934027777777768,
|
|
"completions/max_length": 3705.4,
|
|
"completions/max_terminated_length": 3705.4,
|
|
"completions/mean_length": 671.973193359375,
|
|
"completions/mean_terminated_length": 680.76796875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 188.0,
|
|
"epoch": 0.23999700003749952,
|
|
"grad_norm": 0.0004094898758921772,
|
|
"learning_rate": 3.2530120481927713e-06,
|
|
"loss": -0.0104,
|
|
"num_tokens": 208380890.0,
|
|
"reward": 1.048251223564148,
|
|
"reward_std": 0.13225021511316298,
|
|
"rewards/accuracy_reward": 0.6796875,
|
|
"rewards/brier_reward": 0.8040428400039673,
|
|
"rewards/confidence_uniqueness_reward": 0.9271102786064148,
|
|
"rewards/format_reward": 0.9870659828186035,
|
|
"rewards/frontier_aurc_reward": -0.0017124064732342958,
|
|
"rewards/frontier_coverage_0": -0.0011624779552221298,
|
|
"rewards/frontier_coverage_1": -0.0011624779552221298,
|
|
"rewards/frontier_coverage_10": -0.0011624779552221298,
|
|
"rewards/frontier_coverage_15": -0.0011624779552221298,
|
|
"rewards/frontier_coverage_20": -0.0011624779552221298,
|
|
"rewards/frontier_coverage_25": -0.0011624779552221298,
|
|
"rewards/frontier_coverage_5": -0.0011624779552221298,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.010926000401377678,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16735026240348816,
|
|
"signal/accuracy_reward/group_std_mean": 0.21467447876930237,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4138888895511627,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08367513120174408,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08367513120174408,
|
|
"signal/advantage_abs_mean": 0.09893292784690857,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09893292784690857,
|
|
"signal/advantage_pre_scale_std": 0.17071661055088044,
|
|
"signal/advantage_std": 0.17071661055088044,
|
|
"signal/brier_reward/centered_abs_mean": 0.1086144745349884,
|
|
"signal/brier_reward/group_std_mean": 0.141890849173069,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01357680931687355,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01357680931687355,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04100620374083519,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.060381069034338,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005125775467604399,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005125775467604399,
|
|
"signal/format_reward/centered_abs_mean": 0.020198567770421505,
|
|
"signal/format_reward/group_std_mean": 0.03544421307742596,
|
|
"signal/format_reward/group_zero_std_frac": 0.8638888955116272,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.010099283885210752,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.010099283885210752,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.001432186597958207,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.002116669714450836,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.2377915593096985e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.2377915593096985e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.09051385223865509,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.12151473313570023,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.09051385223865509,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.12151473313570023,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.09051385223865509,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.12151473313570023,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09051385223865509,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.12151473313570023,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.09051385223865509,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.12151473313570023,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.09051385223865509,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.12151473313570023,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.09051385223865509,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.12151473313570023,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0014142789412289857,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.00986800417304039,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.014217101410031319,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0012335005216300488,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0012335005216300488,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.23999700003749952,
|
|
"eval_completions/clipped_ratio": 0.013020833333333334,
|
|
"eval_completions/max_length": 2601.6666666666665,
|
|
"eval_completions/max_terminated_length": 2601.6666666666665,
|
|
"eval_completions/mean_length": 658.6519266764323,
|
|
"eval_completions/mean_terminated_length": 667.3694864908854,
|
|
"eval_completions/min_length": 49.166666666666664,
|
|
"eval_completions/min_terminated_length": 235.33333333333334,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 208380890.0,
|
|
"eval_reward": 1.0273559093475342,
|
|
"eval_reward_std": 0.2644694770375888,
|
|
"eval_rewards/accuracy_reward": 0.65625,
|
|
"eval_rewards/brier_reward": 0.7910055716832479,
|
|
"eval_rewards/confidence_uniqueness_reward": 0.8701359728972117,
|
|
"eval_rewards/format_reward": 0.9861111144224802,
|
|
"eval_rewards/frontier_aurc_reward": -0.0019449660709748666,
|
|
"eval_rewards/frontier_coverage_0": 9.137632635732491e-05,
|
|
"eval_rewards/frontier_coverage_1": 9.137632635732491e-05,
|
|
"eval_rewards/frontier_coverage_10": 9.137632635732491e-05,
|
|
"eval_rewards/frontier_coverage_15": 9.137632635732491e-05,
|
|
"eval_rewards/frontier_coverage_20": 9.137632635732491e-05,
|
|
"eval_rewards/frontier_coverage_25": 9.137632635732491e-05,
|
|
"eval_rewards/frontier_coverage_5": 9.137632635732491e-05,
|
|
"eval_rewards/true_frontier_ece_gap_only_reward": -0.011575784999877214,
|
|
"eval_runtime": 208.6217,
|
|
"eval_samples_per_second": 4.793,
|
|
"eval_signal/accuracy_reward/centered_abs_mean": 0.4365234325329463,
|
|
"eval_signal/accuracy_reward/group_std_mean": 0.47344937423865,
|
|
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21826171626647314,
|
|
"eval_signal/accuracy_reward/weight": 0.5,
|
|
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21826171626647314,
|
|
"eval_signal/advantage_abs_mean": 0.23294218629598618,
|
|
"eval_signal/advantage_pre_scale_abs_mean": 0.23294218629598618,
|
|
"eval_signal/advantage_pre_scale_std": 0.26374371101458866,
|
|
"eval_signal/advantage_std": 0.26374371101458866,
|
|
"eval_signal/brier_reward/centered_abs_mean": 0.17583947877089182,
|
|
"eval_signal/brier_reward/group_std_mean": 0.2251211479306221,
|
|
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.021979934846361477,
|
|
"eval_signal/brier_reward/weight": 0.125,
|
|
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.021979934846361477,
|
|
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.06556083882848422,
|
|
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.09988817572593689,
|
|
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008195104853560528,
|
|
"eval_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008195104853560528,
|
|
"eval_signal/format_reward/centered_abs_mean": 0.026258680348594982,
|
|
"eval_signal/format_reward/group_std_mean": 0.0630940409998099,
|
|
"eval_signal/format_reward/group_zero_std_frac": 0.6944444676240286,
|
|
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.013129340174297491,
|
|
"eval_signal/format_reward/weight": 0.5,
|
|
"eval_signal/format_reward/weighted_centered_abs_mean": 0.013129340174297491,
|
|
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.0024230304018904767,
|
|
"eval_signal/frontier_aurc_reward/group_std_mean": 0.003932161644722025,
|
|
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.78598500295387e-05,
|
|
"eval_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.78598500295387e-05,
|
|
"eval_signal/frontier_coverage_0/centered_abs_mean": 0.12514647220571837,
|
|
"eval_signal/frontier_coverage_0/group_std_mean": 0.1818079153696696,
|
|
"eval_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_0/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_1/centered_abs_mean": 0.12514647220571837,
|
|
"eval_signal/frontier_coverage_1/group_std_mean": 0.1818079153696696,
|
|
"eval_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_1/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_10/centered_abs_mean": 0.12514647220571837,
|
|
"eval_signal/frontier_coverage_10/group_std_mean": 0.1818079153696696,
|
|
"eval_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_10/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_15/centered_abs_mean": 0.12514647220571837,
|
|
"eval_signal/frontier_coverage_15/group_std_mean": 0.1818079153696696,
|
|
"eval_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_15/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_20/centered_abs_mean": 0.12514647220571837,
|
|
"eval_signal/frontier_coverage_20/group_std_mean": 0.1818079153696696,
|
|
"eval_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_20/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_25/centered_abs_mean": 0.12514647220571837,
|
|
"eval_signal/frontier_coverage_25/group_std_mean": 0.1818079153696696,
|
|
"eval_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_25/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_5/centered_abs_mean": 0.12514647220571837,
|
|
"eval_signal/frontier_coverage_5/group_std_mean": 0.1818079153696696,
|
|
"eval_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/frontier_coverage_5/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0019554136282143495,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.01118487554291884,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.017368461936712265,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.001398109442864855,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.001398109442864855,
|
|
"eval_steps_per_second": 0.029,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.23999700003749952,
|
|
"step": 100,
|
|
"train_probe_completions/clipped_ratio": 0.017187500000000022,
|
|
"train_probe_completions/max_length": 2913.6666666666665,
|
|
"train_probe_completions/max_terminated_length": 2913.6666666666665,
|
|
"train_probe_completions/mean_length": 669.9316202799479,
|
|
"train_probe_completions/mean_terminated_length": 681.6310424804688,
|
|
"train_probe_completions/min_length": 0.0,
|
|
"train_probe_completions/min_terminated_length": 213.83333333333334,
|
|
"train_probe_loss": 0.0,
|
|
"train_probe_num_tokens": 208380890.0,
|
|
"train_probe_reward": 1.0431965788205464,
|
|
"train_probe_reward_std": 0.2604084312915802,
|
|
"train_probe_rewards/accuracy_reward": 0.6848958233992258,
|
|
"train_probe_rewards/brier_reward": 0.8070287605126699,
|
|
"train_probe_rewards/confidence_uniqueness_reward": 0.8711295028527578,
|
|
"train_probe_rewards/format_reward": 0.9852430621782938,
|
|
"train_probe_rewards/frontier_aurc_reward": -0.001599111206208666,
|
|
"train_probe_rewards/frontier_coverage_0": -0.00111961656754526,
|
|
"train_probe_rewards/frontier_coverage_1": -0.00111961656754526,
|
|
"train_probe_rewards/frontier_coverage_10": -0.00111961656754526,
|
|
"train_probe_rewards/frontier_coverage_15": -0.00111961656754526,
|
|
"train_probe_rewards/frontier_coverage_20": -0.00111961656754526,
|
|
"train_probe_rewards/frontier_coverage_25": -0.00111961656754526,
|
|
"train_probe_rewards/frontier_coverage_5": -0.00111961656754526,
|
|
"train_probe_rewards/true_frontier_ece_gap_only_reward": -0.011961817430953184,
|
|
"train_probe_runtime": 210.4634,
|
|
"train_probe_samples_per_second": 4.751,
|
|
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.4185655365387599,
|
|
"train_probe_signal/accuracy_reward/group_std_mean": 0.4635271529356639,
|
|
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.20928276826937994,
|
|
"train_probe_signal/accuracy_reward/weight": 0.5,
|
|
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.20928276826937994,
|
|
"train_probe_signal/advantage_abs_mean": 0.22382708390553793,
|
|
"train_probe_signal/advantage_pre_scale_abs_mean": 0.22382708390553793,
|
|
"train_probe_signal/advantage_pre_scale_std": 0.2600322514772415,
|
|
"train_probe_signal/advantage_std": 0.2600322514772415,
|
|
"train_probe_signal/brier_reward/centered_abs_mean": 0.16734372824430466,
|
|
"train_probe_signal/brier_reward/group_std_mean": 0.21692882478237152,
|
|
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.020917966030538082,
|
|
"train_probe_signal/brier_reward/weight": 0.125,
|
|
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.020917966030538082,
|
|
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.06453707938392957,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.10520645851890247,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008067134922991196,
|
|
"train_probe_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008067134922991196,
|
|
"train_probe_signal/format_reward/centered_abs_mean": 0.02783203125,
|
|
"train_probe_signal/format_reward/group_std_mean": 0.0701802521944046,
|
|
"train_probe_signal/format_reward/group_zero_std_frac": 0.6388889104127884,
|
|
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.013916015625,
|
|
"train_probe_signal/format_reward/weight": 0.5,
|
|
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.013916015625,
|
|
"train_probe_signal/frontier_aurc_reward/centered_abs_mean": 0.002024289375791947,
|
|
"train_probe_signal/frontier_aurc_reward/group_std_mean": 0.0032714407813424864,
|
|
"train_probe_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.162952149674917e-05,
|
|
"train_probe_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"train_probe_signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.162952149674917e-05,
|
|
"train_probe_signal/frontier_coverage_0/centered_abs_mean": 0.12542071690162024,
|
|
"train_probe_signal/frontier_coverage_0/group_std_mean": 0.18846788754065832,
|
|
"train_probe_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_0/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_1/centered_abs_mean": 0.12542071690162024,
|
|
"train_probe_signal/frontier_coverage_1/group_std_mean": 0.18846788754065832,
|
|
"train_probe_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_1/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_10/centered_abs_mean": 0.12542071690162024,
|
|
"train_probe_signal/frontier_coverage_10/group_std_mean": 0.18846788754065832,
|
|
"train_probe_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_10/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_15/centered_abs_mean": 0.12542071690162024,
|
|
"train_probe_signal/frontier_coverage_15/group_std_mean": 0.18846788754065832,
|
|
"train_probe_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_15/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_20/centered_abs_mean": 0.12542071690162024,
|
|
"train_probe_signal/frontier_coverage_20/group_std_mean": 0.18846788754065832,
|
|
"train_probe_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_20/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_25/centered_abs_mean": 0.12542071690162024,
|
|
"train_probe_signal/frontier_coverage_25/group_std_mean": 0.18846788754065832,
|
|
"train_probe_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_25/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_5/centered_abs_mean": 0.12542071690162024,
|
|
"train_probe_signal/frontier_coverage_5/group_std_mean": 0.18846788754065832,
|
|
"train_probe_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/frontier_coverage_5/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0019596987015878162,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.011922950390726328,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.018872848711907864,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.001490368798840791,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.001490368798840791,
|
|
"train_probe_steps_per_second": 0.029
|
|
},
|
|
{
|
|
"calibration/aurc": 0.3267044793348773,
|
|
"calibration/batch_distribution_entropy": 0.7514441882328864,
|
|
"calibration/batch_entropy_100bins": 0.8095879565010626,
|
|
"calibration/batch_entropy_10bins": 0.7514441882328864,
|
|
"calibration/batch_entropy_50bins": 0.8265329826402684,
|
|
"calibration/batch_uniqueness": 0.9404129011408552,
|
|
"calibration/buffer_distribution_entropy": 0.8180824212746325,
|
|
"calibration/buffer_entropy_100bins": 0.7960694640163056,
|
|
"calibration/buffer_entropy_10bins": 0.8180824212746325,
|
|
"calibration/buffer_entropy_50bins": 0.8335788091827643,
|
|
"calibration/confidence_entropy": 0.5574560509219397,
|
|
"calibration/coverage@0%": 0.00994418656056587,
|
|
"calibration/coverage@1%": 0.00994418656056587,
|
|
"calibration/coverage@10%": 0.14097866931918657,
|
|
"calibration/coverage@15%": 0.1568937886825818,
|
|
"calibration/coverage@20%": 0.2546232650242107,
|
|
"calibration/coverage@25%": 0.3336462535299578,
|
|
"calibration/coverage@30%": 0.389026431209603,
|
|
"calibration/coverage@5%": 0.10649591069849691,
|
|
"calibration/distribution_entropy_10": 0.7514441882328864,
|
|
"calibration/distribution_entropy_100": 0.8095879565010626,
|
|
"calibration/ece": 0.14820278446655621,
|
|
"calibration/mean_confidence": 0.6889263632869203,
|
|
"calibration/unique_confidence_per_question": 0.175,
|
|
"calibration/unique_confidences": 67.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.012065972222222231,
|
|
"completions/max_length": 3604.2,
|
|
"completions/max_terminated_length": 3604.2,
|
|
"completions/mean_length": 673.2842163085937,
|
|
"completions/mean_terminated_length": 681.5394897460938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 179.0,
|
|
"epoch": 0.2519968500393745,
|
|
"grad_norm": 0.0004055156314279884,
|
|
"learning_rate": 3.1024096385542172e-06,
|
|
"loss": -0.0102,
|
|
"num_tokens": 219213988.0,
|
|
"reward": 1.044841742515564,
|
|
"reward_std": 0.12685696184635162,
|
|
"rewards/accuracy_reward": 0.6730902671813965,
|
|
"rewards/brier_reward": 0.7991919994354248,
|
|
"rewards/confidence_uniqueness_reward": 0.9291059017181397,
|
|
"rewards/format_reward": 0.9876736044883728,
|
|
"rewards/frontier_aurc_reward": -0.001739606261253357,
|
|
"rewards/frontier_coverage_0": -0.001969197951257229,
|
|
"rewards/frontier_coverage_1": -0.001969197951257229,
|
|
"rewards/frontier_coverage_10": -0.001969197951257229,
|
|
"rewards/frontier_coverage_15": -0.001969197951257229,
|
|
"rewards/frontier_coverage_20": -0.001969197951257229,
|
|
"rewards/frontier_coverage_25": -0.001969197951257229,
|
|
"rewards/frontier_coverage_5": -0.001969197951257229,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.010678962059319019,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.15438368171453476,
|
|
"signal/accuracy_reward/group_std_mean": 0.20449974834918977,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.42222222685813904,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07719184085726738,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07719184085726738,
|
|
"signal/advantage_abs_mean": 0.09131217449903488,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09131217449903488,
|
|
"signal/advantage_pre_scale_std": 0.16192201673984527,
|
|
"signal/advantage_std": 0.16192201673984527,
|
|
"signal/brier_reward/centered_abs_mean": 0.1098570004105568,
|
|
"signal/brier_reward/group_std_mean": 0.14484120011329651,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0137321250513196,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.0137321250513196,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04084142223000527,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06246491596102714,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005105177778750658,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005105177778750658,
|
|
"signal/format_reward/centered_abs_mean": 0.01923828125,
|
|
"signal/format_reward/group_std_mean": 0.03651031218469143,
|
|
"signal/format_reward/group_zero_std_frac": 0.850000011920929,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.009619140625,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.009619140625,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0014394932892173528,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0021645855624228714,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.2492082644021137e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.2492082644021137e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.0979076936841011,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.13328861594200134,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.0979076936841011,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.13328861594200134,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.0979076936841011,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.13328861594200134,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.0979076936841011,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.13328861594200134,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.0979076936841011,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.13328861594200134,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.0979076936841011,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.13328861594200134,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.0979076936841011,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.13328861594200134,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0015298077138140797,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.009452897682785988,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.014724508672952653,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0011816122103482484,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0011816122103482484,
|
|
"step": 105
|
|
},
|
|
{
|
|
"calibration/aurc": 0.19653911867283705,
|
|
"calibration/batch_distribution_entropy": 0.742189294691922,
|
|
"calibration/batch_entropy_100bins": 0.8103696123897912,
|
|
"calibration/batch_entropy_10bins": 0.742189294691922,
|
|
"calibration/batch_entropy_50bins": 0.8205309171620865,
|
|
"calibration/batch_uniqueness": 0.9383345760593518,
|
|
"calibration/buffer_distribution_entropy": 0.8196634901938914,
|
|
"calibration/buffer_entropy_100bins": 0.8046767967346928,
|
|
"calibration/buffer_entropy_10bins": 0.8196634901938914,
|
|
"calibration/buffer_entropy_50bins": 0.8391647012059913,
|
|
"calibration/confidence_entropy": 0.5437583638987209,
|
|
"calibration/coverage@0%": 0.017870423903897183,
|
|
"calibration/coverage@1%": 0.017870423903897183,
|
|
"calibration/coverage@10%": 0.14836196586429348,
|
|
"calibration/coverage@15%": 0.2993953758400066,
|
|
"calibration/coverage@20%": 0.5307628916308488,
|
|
"calibration/coverage@25%": 0.7679343830030556,
|
|
"calibration/coverage@30%": 0.9234316719507646,
|
|
"calibration/coverage@5%": 0.04352487416567728,
|
|
"calibration/distribution_entropy_10": 0.742189294691922,
|
|
"calibration/distribution_entropy_100": 0.8103696123897912,
|
|
"calibration/ece": 0.09015899744059616,
|
|
"calibration/mean_confidence": 0.6941812575865814,
|
|
"calibration/unique_confidence_per_question": 0.19114583333333332,
|
|
"calibration/unique_confidences": 73.4,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.009982638888888905,
|
|
"completions/max_length": 3548.6,
|
|
"completions/max_terminated_length": 3548.6,
|
|
"completions/mean_length": 683.9628540039063,
|
|
"completions/mean_terminated_length": 690.8879516601562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 193.6,
|
|
"epoch": 0.2639967000412495,
|
|
"grad_norm": 0.0004070218128617853,
|
|
"learning_rate": 2.9518072289156627e-06,
|
|
"loss": -0.0086,
|
|
"num_tokens": 230201688.0,
|
|
"reward": 1.0639292001724243,
|
|
"reward_std": 0.12081557959318161,
|
|
"rewards/accuracy_reward": 0.7053819298744202,
|
|
"rewards/brier_reward": 0.8154799818992615,
|
|
"rewards/confidence_uniqueness_reward": 0.929580807685852,
|
|
"rewards/format_reward": 0.989843738079071,
|
|
"rewards/frontier_aurc_reward": -0.001527873962186277,
|
|
"rewards/frontier_coverage_0": -0.005524499481543899,
|
|
"rewards/frontier_coverage_1": -0.005524499481543899,
|
|
"rewards/frontier_coverage_10": -0.005524499481543899,
|
|
"rewards/frontier_coverage_15": -0.005524499481543899,
|
|
"rewards/frontier_coverage_20": -0.005524499481543899,
|
|
"rewards/frontier_coverage_25": -0.005524499481543899,
|
|
"rewards/frontier_coverage_5": -0.005524499481543899,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.009504916891455651,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.14922960102558136,
|
|
"signal/accuracy_reward/group_std_mean": 0.20320949256420134,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07461480051279068,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07461480051279068,
|
|
"signal/advantage_abs_mean": 0.0867169290781021,
|
|
"signal/advantage_pre_scale_abs_mean": 0.0867169290781021,
|
|
"signal/advantage_pre_scale_std": 0.158852681517601,
|
|
"signal/advantage_std": 0.158852681517601,
|
|
"signal/brier_reward/centered_abs_mean": 0.10190331041812897,
|
|
"signal/brier_reward/group_std_mean": 0.13416367769241333,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.012737913802266122,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.012737913802266122,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03820802196860314,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05503500029444695,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004776002746075392,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004776002746075392,
|
|
"signal/format_reward/centered_abs_mean": 0.01648763045668602,
|
|
"signal/format_reward/group_std_mean": 0.029129663482308388,
|
|
"signal/format_reward/group_zero_std_frac": 0.8833333373069763,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00824381522834301,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.00824381522834301,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0013106558239087463,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.001980750821530819,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.047899724857416e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.047899724857416e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.09209920465946198,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.12662244141101836,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.09209920465946198,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.12662244141101836,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.09209920465946198,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.12662244141101836,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09209920465946198,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.12662244141101836,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.09209920465946198,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.12662244141101836,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.09209920465946198,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.12662244141101836,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.09209920465946198,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.12662244141101836,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0014390500728040934,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.008595239371061325,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.01369424220174551,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0010744049213826656,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0010744049213826656,
|
|
"step": 110
|
|
},
|
|
{
|
|
"calibration/aurc": 0.30524022173226106,
|
|
"calibration/batch_distribution_entropy": 0.7818711531082017,
|
|
"calibration/batch_entropy_100bins": 0.8248857687848549,
|
|
"calibration/batch_entropy_10bins": 0.7818711531082017,
|
|
"calibration/batch_entropy_50bins": 0.8433798093291898,
|
|
"calibration/batch_uniqueness": 0.9433997785726878,
|
|
"calibration/buffer_distribution_entropy": 0.8217132367095644,
|
|
"calibration/buffer_entropy_100bins": 0.8122731904636383,
|
|
"calibration/buffer_entropy_10bins": 0.8217132367095644,
|
|
"calibration/buffer_entropy_50bins": 0.8442563490493079,
|
|
"calibration/confidence_entropy": 0.5560049283881041,
|
|
"calibration/coverage@0%": 0.006447631273640926,
|
|
"calibration/coverage@1%": 0.006447631273640926,
|
|
"calibration/coverage@10%": 0.006447631273640926,
|
|
"calibration/coverage@15%": 0.14715146010246977,
|
|
"calibration/coverage@20%": 0.4041453376318914,
|
|
"calibration/coverage@25%": 0.5688995513801839,
|
|
"calibration/coverage@30%": 0.6564715896999361,
|
|
"calibration/coverage@5%": 0.006447631273640926,
|
|
"calibration/distribution_entropy_10": 0.7818711531082017,
|
|
"calibration/distribution_entropy_100": 0.8248857687848549,
|
|
"calibration/ece": 0.15221433813065566,
|
|
"calibration/mean_confidence": 0.6707838972409772,
|
|
"calibration/unique_confidence_per_question": 0.18958333333333335,
|
|
"calibration/unique_confidences": 72.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.017708333333333326,
|
|
"completions/max_length": 3419.2,
|
|
"completions/max_terminated_length": 3419.2,
|
|
"completions/mean_length": 679.98916015625,
|
|
"completions/mean_terminated_length": 692.3475830078125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 207.8,
|
|
"epoch": 0.27599655004312446,
|
|
"grad_norm": 0.00038507970748469234,
|
|
"learning_rate": 2.8012048192771087e-06,
|
|
"loss": -0.0156,
|
|
"num_tokens": 241114363.0,
|
|
"reward": 1.0363726139068603,
|
|
"reward_std": 0.13486612737178802,
|
|
"rewards/accuracy_reward": 0.6633680582046508,
|
|
"rewards/brier_reward": 0.793210256099701,
|
|
"rewards/confidence_uniqueness_reward": 0.9223326444625854,
|
|
"rewards/format_reward": 0.9822916626930237,
|
|
"rewards/frontier_aurc_reward": -0.0019153600791469217,
|
|
"rewards/frontier_coverage_0": 0.003437680657953024,
|
|
"rewards/frontier_coverage_1": 0.003437680657953024,
|
|
"rewards/frontier_coverage_10": 0.003437680657953024,
|
|
"rewards/frontier_coverage_15": 0.003437680657953024,
|
|
"rewards/frontier_coverage_20": 0.003437680657953024,
|
|
"rewards/frontier_coverage_25": 0.003437680657953024,
|
|
"rewards/frontier_coverage_5": 0.003437680657953024,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.009969686530530453,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16082899123430253,
|
|
"signal/accuracy_reward/group_std_mean": 0.2069980025291443,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4277777850627899,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08041449561715126,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08041449561715126,
|
|
"signal/advantage_abs_mean": 0.09912077933549882,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09912077933549882,
|
|
"signal/advantage_pre_scale_std": 0.1765537291765213,
|
|
"signal/advantage_std": 0.1765537291765213,
|
|
"signal/brier_reward/centered_abs_mean": 0.11570018827915192,
|
|
"signal/brier_reward/group_std_mean": 0.151495760679245,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01446252353489399,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01446252353489399,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04762213602662087,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.07307658642530442,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0059527670033276085,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0059527670033276085,
|
|
"signal/format_reward/centered_abs_mean": 0.02702907994389534,
|
|
"signal/format_reward/group_std_mean": 0.04844924733042717,
|
|
"signal/format_reward/group_zero_std_frac": 0.8111111164093018,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01351453997194767,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.01351453997194767,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0016357111278921365,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.002513893973082304,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.5557986373314633e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.5557986373314633e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.09530313909053803,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.12956467568874358,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.09530313909053803,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.12956467568874358,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.09530313909053803,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.12956467568874358,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09530313909053803,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.12956467568874358,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.09530313909053803,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.12956467568874358,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.09530313909053803,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.12956467568874358,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.09530313909053803,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.12956467568874358,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0014891115482896567,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.00945689920336008,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.01578503046184778,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00118211240042001,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00118211240042001,
|
|
"step": 115
|
|
},
|
|
{
|
|
"calibration/aurc": 0.2829972784176247,
|
|
"calibration/batch_distribution_entropy": 0.7978632783074413,
|
|
"calibration/batch_entropy_100bins": 0.8326413576750176,
|
|
"calibration/batch_entropy_10bins": 0.7978632783074413,
|
|
"calibration/batch_entropy_50bins": 0.8484370752003694,
|
|
"calibration/batch_uniqueness": 0.9428754810521134,
|
|
"calibration/buffer_distribution_entropy": 0.8256495412935981,
|
|
"calibration/buffer_entropy_100bins": 0.8198335167796709,
|
|
"calibration/buffer_entropy_10bins": 0.8256495412935981,
|
|
"calibration/buffer_entropy_50bins": 0.8499243478716128,
|
|
"calibration/confidence_entropy": 0.5454490383451711,
|
|
"calibration/coverage@0%": 0.024511889179755674,
|
|
"calibration/coverage@1%": 0.024511889179755674,
|
|
"calibration/coverage@10%": 0.18489583333333334,
|
|
"calibration/coverage@15%": 0.3580170157068063,
|
|
"calibration/coverage@20%": 0.47328206806282724,
|
|
"calibration/coverage@25%": 0.5411131108202444,
|
|
"calibration/coverage@30%": 0.5953125,
|
|
"calibration/coverage@5%": 0.04899105584642234,
|
|
"calibration/distribution_entropy_10": 0.7978632783074413,
|
|
"calibration/distribution_entropy_100": 0.8326413576750176,
|
|
"calibration/ece": 0.14527504044433004,
|
|
"calibration/mean_confidence": 0.6648941085090072,
|
|
"calibration/unique_confidence_per_question": 0.2046875,
|
|
"calibration/unique_confidences": 78.6,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.011024305555555558,
|
|
"completions/max_length": 3318.6,
|
|
"completions/max_terminated_length": 3318.6,
|
|
"completions/mean_length": 686.3271728515625,
|
|
"completions/mean_terminated_length": 693.9168334960938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 229.2,
|
|
"epoch": 0.28799640004499943,
|
|
"grad_norm": 0.0003715228522196412,
|
|
"learning_rate": 2.6506024096385547e-06,
|
|
"loss": -0.009,
|
|
"num_tokens": 252102708.0,
|
|
"reward": 1.048642134666443,
|
|
"reward_std": 0.12622617483139037,
|
|
"rewards/accuracy_reward": 0.6764756917953492,
|
|
"rewards/brier_reward": 0.8032851457595825,
|
|
"rewards/confidence_uniqueness_reward": 0.9316941499710083,
|
|
"rewards/format_reward": 0.9888888835906983,
|
|
"rewards/frontier_aurc_reward": -0.001726908260025084,
|
|
"rewards/frontier_coverage_0": 0.0026118648587726057,
|
|
"rewards/frontier_coverage_1": 0.0026118648587726057,
|
|
"rewards/frontier_coverage_10": 0.0026118648587726057,
|
|
"rewards/frontier_coverage_15": 0.0026118648587726057,
|
|
"rewards/frontier_coverage_20": 0.0026118648587726057,
|
|
"rewards/frontier_coverage_25": 0.0026118648587726057,
|
|
"rewards/frontier_coverage_5": 0.0026118648587726057,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.009370057098567485,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16347113847732545,
|
|
"signal/accuracy_reward/group_std_mean": 0.21460457444190978,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.38888890147209165,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08173556923866272,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08173556923866272,
|
|
"signal/advantage_abs_mean": 0.09389262199401856,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09389262199401856,
|
|
"signal/advantage_pre_scale_std": 0.16233381628990173,
|
|
"signal/advantage_std": 0.16233381628990173,
|
|
"signal/brier_reward/centered_abs_mean": 0.10937037020921707,
|
|
"signal/brier_reward/group_std_mean": 0.14254879355430602,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013671296276152134,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013671296276152134,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03843596838414669,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.057198996841907504,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004804496048018336,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004804496048018336,
|
|
"signal/format_reward/centered_abs_mean": 0.017404513992369176,
|
|
"signal/format_reward/group_std_mean": 0.032134901732206345,
|
|
"signal/format_reward/group_zero_std_frac": 0.8666666626930237,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.008702256996184588,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.008702256996184588,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0014132563257589937,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0021206842735409736,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.2082130089984276e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.2082130089984276e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.10615848153829574,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.14436171054840088,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.10615848153829574,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.14436171054840088,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.10615848153829574,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.14436171054840088,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.10615848153829574,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.14436171054840088,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.10615848153829574,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.14436171054840088,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.10615848153829574,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.14436171054840088,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.10615848153829574,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.14436171054840088,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001658726274035871,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.008557920716702938,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.014681273698806762,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0010697400895878673,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0010697400895878673,
|
|
"step": 120
|
|
},
|
|
{
|
|
"calibration/aurc": 0.19671941465770884,
|
|
"calibration/batch_distribution_entropy": 0.714069251213561,
|
|
"calibration/batch_entropy_100bins": 0.79093280229107,
|
|
"calibration/batch_entropy_10bins": 0.714069251213561,
|
|
"calibration/batch_entropy_50bins": 0.7976220273158751,
|
|
"calibration/batch_uniqueness": 0.9272707791642439,
|
|
"calibration/buffer_distribution_entropy": 0.8283623145695502,
|
|
"calibration/buffer_entropy_100bins": 0.8261738795237367,
|
|
"calibration/buffer_entropy_10bins": 0.8283623145695502,
|
|
"calibration/buffer_entropy_50bins": 0.8545030151086038,
|
|
"calibration/confidence_entropy": 0.5104855330872758,
|
|
"calibration/coverage@0%": 0.014166491343532641,
|
|
"calibration/coverage@1%": 0.014166491343532641,
|
|
"calibration/coverage@10%": 0.31885399134353265,
|
|
"calibration/coverage@15%": 0.3815390808752131,
|
|
"calibration/coverage@20%": 0.4841432475418797,
|
|
"calibration/coverage@25%": 0.8064810181594071,
|
|
"calibration/coverage@30%": 0.9008741303877912,
|
|
"calibration/coverage@5%": 0.014166491343532641,
|
|
"calibration/distribution_entropy_10": 0.714069251213561,
|
|
"calibration/distribution_entropy_100": 0.79093280229107,
|
|
"calibration/ece": 0.09618043935237061,
|
|
"calibration/mean_confidence": 0.7132224626554917,
|
|
"calibration/unique_confidence_per_question": 0.1828125,
|
|
"calibration/unique_confidences": 70.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.012239583333333304,
|
|
"completions/max_length": 3284.4,
|
|
"completions/max_terminated_length": 3284.4,
|
|
"completions/mean_length": 691.3783081054687,
|
|
"completions/mean_terminated_length": 700.0879638671875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 221.8,
|
|
"epoch": 0.2999962500468744,
|
|
"grad_norm": 0.00044733521644957364,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": -0.0113,
|
|
"num_tokens": 263185050.0,
|
|
"reward": 1.055611777305603,
|
|
"reward_std": 0.1275094196200371,
|
|
"rewards/accuracy_reward": 0.6895833253860474,
|
|
"rewards/brier_reward": 0.8160496830940247,
|
|
"rewards/confidence_uniqueness_reward": 0.92153559923172,
|
|
"rewards/format_reward": 0.9876736164093017,
|
|
"rewards/frontier_aurc_reward": -0.0017230862518772482,
|
|
"rewards/frontier_coverage_0": 0.01225762339308858,
|
|
"rewards/frontier_coverage_1": 0.01225762339308858,
|
|
"rewards/frontier_coverage_10": 0.01225762339308858,
|
|
"rewards/frontier_coverage_15": 0.01225762339308858,
|
|
"rewards/frontier_coverage_20": 0.01225762339308858,
|
|
"rewards/frontier_coverage_25": 0.01225762339308858,
|
|
"rewards/frontier_coverage_5": 0.01225762339308858,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.012229060940444469,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16138237714767456,
|
|
"signal/accuracy_reward/group_std_mean": 0.21225160956382752,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.397222226858139,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08069118857383728,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08069118857383728,
|
|
"signal/advantage_abs_mean": 0.0945181205868721,
|
|
"signal/advantage_pre_scale_abs_mean": 0.0945181205868721,
|
|
"signal/advantage_pre_scale_std": 0.1663988560438156,
|
|
"signal/advantage_std": 0.1663988560438156,
|
|
"signal/brier_reward/centered_abs_mean": 0.11373110711574555,
|
|
"signal/brier_reward/group_std_mean": 0.1483635872602463,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014216388389468194,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014216388389468194,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04427947551012039,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.062184395641088484,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005534934438765049,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005534934438765049,
|
|
"signal/format_reward/centered_abs_mean": 0.018576388712972403,
|
|
"signal/format_reward/group_std_mean": 0.031818334758281705,
|
|
"signal/format_reward/group_zero_std_frac": 0.8777777791023255,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.009288194356486201,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.009288194356486201,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0016881852876394986,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0025606358423829077,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.6377895119367166e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.6377895119367166e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.0979716956615448,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1332421526312828,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.0979716956615448,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1332421526312828,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.0979716956615448,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1332421526312828,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.0979716956615448,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.1332421526312828,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.0979716956615448,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.1332421526312828,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.0979716956615448,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.1332421526312828,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.0979716956615448,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1332421526312828,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0015308077447116375,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.01217461358755827,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.02020731884986162,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0015218266984447838,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0015218266984447838,
|
|
"step": 125
|
|
},
|
|
{
|
|
"calibration/aurc": 0.26231922913161954,
|
|
"calibration/batch_distribution_entropy": 0.7849330143793635,
|
|
"calibration/batch_entropy_100bins": 0.8313458268638213,
|
|
"calibration/batch_entropy_10bins": 0.7849330143793635,
|
|
"calibration/batch_entropy_50bins": 0.8469249993347873,
|
|
"calibration/batch_uniqueness": 0.9375108076362693,
|
|
"calibration/buffer_distribution_entropy": 0.8294780083702189,
|
|
"calibration/buffer_entropy_100bins": 0.8315937057293505,
|
|
"calibration/buffer_entropy_10bins": 0.8294780083702189,
|
|
"calibration/buffer_entropy_50bins": 0.8582865661934059,
|
|
"calibration/confidence_entropy": 0.5124842329428232,
|
|
"calibration/coverage@0%": 0.012623205773998373,
|
|
"calibration/coverage@1%": 0.012623205773998373,
|
|
"calibration/coverage@10%": 0.1614250792927287,
|
|
"calibration/coverage@15%": 0.2572956123196314,
|
|
"calibration/coverage@20%": 0.3132771916457143,
|
|
"calibration/coverage@25%": 0.43678116869105654,
|
|
"calibration/coverage@30%": 0.8075378762952686,
|
|
"calibration/coverage@5%": 0.022596959054838266,
|
|
"calibration/distribution_entropy_10": 0.7849330143793635,
|
|
"calibration/distribution_entropy_100": 0.8313458268638213,
|
|
"calibration/ece": 0.12743665254843256,
|
|
"calibration/mean_confidence": 0.6790373502611695,
|
|
"calibration/unique_confidence_per_question": 0.21770833333333334,
|
|
"calibration/unique_confidences": 83.6,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.017795138888888885,
|
|
"completions/max_length": 3700.6,
|
|
"completions/max_terminated_length": 3700.6,
|
|
"completions/mean_length": 712.4034790039062,
|
|
"completions/mean_terminated_length": 725.374267578125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 191.2,
|
|
"epoch": 0.3119961000487494,
|
|
"grad_norm": 0.00043796157115139067,
|
|
"learning_rate": 2.349397590361446e-06,
|
|
"loss": -0.0143,
|
|
"num_tokens": 274516738.0,
|
|
"reward": 1.0368208050727845,
|
|
"reward_std": 0.13528352975845337,
|
|
"rewards/accuracy_reward": 0.6607638716697692,
|
|
"rewards/brier_reward": 0.8013606786727905,
|
|
"rewards/confidence_uniqueness_reward": 0.9176180601119995,
|
|
"rewards/format_reward": 0.9821180462837219,
|
|
"rewards/frontier_aurc_reward": -0.001984483632259071,
|
|
"rewards/frontier_coverage_0": 0.01965160174295306,
|
|
"rewards/frontier_coverage_1": 0.01965160174295306,
|
|
"rewards/frontier_coverage_10": 0.01965160174295306,
|
|
"rewards/frontier_coverage_15": 0.01965160174295306,
|
|
"rewards/frontier_coverage_20": 0.01965160174295306,
|
|
"rewards/frontier_coverage_25": 0.01965160174295306,
|
|
"rewards/frontier_coverage_5": 0.01965160174295306,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.012887386418879033,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1666666716337204,
|
|
"signal/accuracy_reward/group_std_mean": 0.21933417916297912,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.3805555641651154,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0833333358168602,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0833333358168602,
|
|
"signal/advantage_abs_mean": 0.09987292736768723,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09987292736768723,
|
|
"signal/advantage_pre_scale_std": 0.17289304733276367,
|
|
"signal/advantage_std": 0.17289304733276367,
|
|
"signal/brier_reward/centered_abs_mean": 0.11932021975517274,
|
|
"signal/brier_reward/group_std_mean": 0.15680161118507385,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014915027469396592,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014915027469396592,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.049394051730632785,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.07107506543397904,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.006174256466329098,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006174256466329098,
|
|
"signal/format_reward/centered_abs_mean": 0.02468533031642437,
|
|
"signal/format_reward/group_std_mean": 0.04145882315933704,
|
|
"signal/format_reward/group_zero_std_frac": 0.8416666746139526,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.012342665158212186,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.012342665158212186,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0018592241685837507,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.002899319725111127,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.9050377634121105e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.9050377634121105e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.10627900362014771,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.14542074501514435,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.10627900362014771,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.14542074501514435,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.10627900362014771,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.14542074501514435,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.10627900362014771,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.14542074501514435,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.10627900362014771,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.14542074501514435,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.10627900362014771,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.14542074501514435,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.10627900362014771,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.14542074501514435,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001660609431564808,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.013442159257829189,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.02378322519361973,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0016802699072286486,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0016802699072286486,
|
|
"step": 130
|
|
},
|
|
{
|
|
"calibration/aurc": 0.2100724966240294,
|
|
"calibration/batch_distribution_entropy": 0.7932919126000467,
|
|
"calibration/batch_entropy_100bins": 0.8307378530163175,
|
|
"calibration/batch_entropy_10bins": 0.7932919126000467,
|
|
"calibration/batch_entropy_50bins": 0.847089723931882,
|
|
"calibration/batch_uniqueness": 0.9421351893583964,
|
|
"calibration/buffer_distribution_entropy": 0.8324469338084997,
|
|
"calibration/buffer_entropy_100bins": 0.8370055455965371,
|
|
"calibration/buffer_entropy_10bins": 0.8324469338084997,
|
|
"calibration/buffer_entropy_50bins": 0.862450153507045,
|
|
"calibration/confidence_entropy": 0.5360752773461647,
|
|
"calibration/coverage@0%": 0.019922239322602862,
|
|
"calibration/coverage@1%": 0.019922239322602862,
|
|
"calibration/coverage@10%": 0.3430041437433335,
|
|
"calibration/coverage@15%": 0.35355823345309606,
|
|
"calibration/coverage@20%": 0.4321721670970639,
|
|
"calibration/coverage@25%": 0.6357222721233556,
|
|
"calibration/coverage@30%": 0.7409831478414641,
|
|
"calibration/coverage@5%": 0.30943497381008883,
|
|
"calibration/distribution_entropy_10": 0.7932919126000467,
|
|
"calibration/distribution_entropy_100": 0.8307378530163175,
|
|
"calibration/ece": 0.13974870453897614,
|
|
"calibration/mean_confidence": 0.6561501450334012,
|
|
"calibration/unique_confidence_per_question": 0.19531250000000003,
|
|
"calibration/unique_confidences": 75.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0109375,
|
|
"completions/max_length": 3228.4,
|
|
"completions/max_terminated_length": 3228.4,
|
|
"completions/mean_length": 692.0186767578125,
|
|
"completions/mean_terminated_length": 699.7394653320313,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 181.8,
|
|
"epoch": 0.32399595005062437,
|
|
"grad_norm": 0.0004282255540601909,
|
|
"learning_rate": 2.1987951807228917e-06,
|
|
"loss": -0.0093,
|
|
"num_tokens": 285581817.0,
|
|
"reward": 1.054863166809082,
|
|
"reward_std": 0.12371634542942048,
|
|
"rewards/accuracy_reward": 0.6837673544883728,
|
|
"rewards/brier_reward": 0.8118860840797424,
|
|
"rewards/confidence_uniqueness_reward": 0.9371923685073853,
|
|
"rewards/format_reward": 0.9890625,
|
|
"rewards/frontier_aurc_reward": -0.0013375790789723397,
|
|
"rewards/frontier_coverage_0": 0.006732956040650606,
|
|
"rewards/frontier_coverage_1": 0.006732956040650606,
|
|
"rewards/frontier_coverage_10": 0.006732956040650606,
|
|
"rewards/frontier_coverage_15": 0.006732956040650606,
|
|
"rewards/frontier_coverage_20": 0.006732956040650606,
|
|
"rewards/frontier_coverage_25": 0.006732956040650606,
|
|
"rewards/frontier_coverage_5": 0.006732956040650606,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.007217477634549141,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.15847981721162796,
|
|
"signal/accuracy_reward/group_std_mean": 0.21153208017349243,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.38888888955116274,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07923990860581398,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07923990860581398,
|
|
"signal/advantage_abs_mean": 0.08863486796617508,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08863486796617508,
|
|
"signal/advantage_pre_scale_std": 0.15808248221874238,
|
|
"signal/advantage_std": 0.15808248221874238,
|
|
"signal/brier_reward/centered_abs_mean": 0.11114487051963806,
|
|
"signal/brier_reward/group_std_mean": 0.14610919654369353,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013893108814954758,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013893108814954758,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03625557161867619,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05811716765165329,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004531946452334523,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004531946452334523,
|
|
"signal/format_reward/centered_abs_mean": 0.01872829869389534,
|
|
"signal/format_reward/group_std_mean": 0.036873598024249075,
|
|
"signal/format_reward/group_zero_std_frac": 0.8416666626930237,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00936414934694767,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.00936414934694767,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0011374737368896604,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0018357637338340283,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.7773027138900943e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.7773027138900943e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.12717922925949096,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.17108558714389802,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.12717922925949096,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.17108558714389802,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12717922925949096,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.17108558714389802,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.12717922925949096,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.17108558714389802,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.12717922925949096,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.17108558714389802,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.12717922925949096,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.17108558714389802,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.12717922925949096,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.17108558714389802,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0019871754571795463,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.006792500615119934,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.012135511264204979,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0008490625768899918,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0008490625768899918,
|
|
"step": 135
|
|
},
|
|
{
|
|
"calibration/aurc": 0.12316983748625196,
|
|
"calibration/batch_distribution_entropy": 0.8033321196483671,
|
|
"calibration/batch_entropy_100bins": 0.840970506147876,
|
|
"calibration/batch_entropy_10bins": 0.8033321196483671,
|
|
"calibration/batch_entropy_50bins": 0.8595641216344057,
|
|
"calibration/batch_uniqueness": 0.9441387779910293,
|
|
"calibration/buffer_distribution_entropy": 0.8356524945448074,
|
|
"calibration/buffer_entropy_100bins": 0.844583354188865,
|
|
"calibration/buffer_entropy_10bins": 0.8356524945448074,
|
|
"calibration/buffer_entropy_50bins": 0.8676924896189366,
|
|
"calibration/confidence_entropy": 0.5414429455483087,
|
|
"calibration/coverage@0%": 0.024367036493294764,
|
|
"calibration/coverage@1%": 0.024367036493294764,
|
|
"calibration/coverage@10%": 0.5015683248215262,
|
|
"calibration/coverage@15%": 0.7200879000764464,
|
|
"calibration/coverage@20%": 0.806558927378318,
|
|
"calibration/coverage@25%": 0.887908015749872,
|
|
"calibration/coverage@30%": 0.9627324607905436,
|
|
"calibration/coverage@5%": 0.21789834997139393,
|
|
"calibration/distribution_entropy_10": 0.8033321196483671,
|
|
"calibration/distribution_entropy_100": 0.840970506147876,
|
|
"calibration/ece": 0.12557287187845612,
|
|
"calibration/mean_confidence": 0.6414325364057707,
|
|
"calibration/unique_confidence_per_question": 0.20520833333333335,
|
|
"calibration/unique_confidences": 78.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.012847222222222232,
|
|
"completions/max_length": 3215.0,
|
|
"completions/max_terminated_length": 3215.0,
|
|
"completions/mean_length": 675.0425415039062,
|
|
"completions/mean_terminated_length": 683.766796875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 194.2,
|
|
"epoch": 0.33599580005249935,
|
|
"grad_norm": 0.0003816323878709227,
|
|
"learning_rate": 2.0481927710843377e-06,
|
|
"loss": -0.0113,
|
|
"num_tokens": 296462531.0,
|
|
"reward": 1.0525088548660277,
|
|
"reward_std": 0.12087329030036927,
|
|
"rewards/accuracy_reward": 0.681163203716278,
|
|
"rewards/brier_reward": 0.8102995872497558,
|
|
"rewards/confidence_uniqueness_reward": 0.9345194458961487,
|
|
"rewards/format_reward": 0.9871527791023255,
|
|
"rewards/frontier_aurc_reward": -0.0013744331081397831,
|
|
"rewards/frontier_coverage_0": 0.00963379731401801,
|
|
"rewards/frontier_coverage_1": 0.00963379731401801,
|
|
"rewards/frontier_coverage_10": 0.00963379731401801,
|
|
"rewards/frontier_coverage_15": 0.00963379731401801,
|
|
"rewards/frontier_coverage_20": 0.00963379731401801,
|
|
"rewards/frontier_coverage_25": 0.00963379731401801,
|
|
"rewards/frontier_coverage_5": 0.00963379731401801,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.006269952561706304,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.14478624165058135,
|
|
"signal/accuracy_reward/group_std_mean": 0.19590498208999635,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4222222208976746,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07239312082529067,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07239312082529067,
|
|
"signal/advantage_abs_mean": 0.08584622740745544,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08584622740745544,
|
|
"signal/advantage_pre_scale_std": 0.15582461655139923,
|
|
"signal/advantage_std": 0.15582461655139923,
|
|
"signal/brier_reward/centered_abs_mean": 0.11108436435461044,
|
|
"signal/brier_reward/group_std_mean": 0.14652538895606995,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013885545544326305,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013885545544326305,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.039488519355654714,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06021577715873718,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004936064919456839,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004936064919456839,
|
|
"signal/format_reward/centered_abs_mean": 0.020616319216787814,
|
|
"signal/format_reward/group_std_mean": 0.037392809987068176,
|
|
"signal/format_reward/group_zero_std_frac": 0.85,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.010308159608393907,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.010308159608393907,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0012093130266293884,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0018805687082931398,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.8895516041084194e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.8895516041084194e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.12452564984560013,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.17022224068641661,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.12452564984560013,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.17022224068641661,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12452564984560013,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.17022224068641661,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.12452564984560013,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.17022224068641661,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.12452564984560013,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.17022224068641661,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.12452564984560013,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.17022224068641661,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.12452564984560013,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.17022224068641661,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001945713278837502,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0054856881499290465,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.00944354822859168,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0006857110187411308,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0006857110187411308,
|
|
"step": 140
|
|
},
|
|
{
|
|
"calibration/aurc": 0.1707190282167193,
|
|
"calibration/batch_distribution_entropy": 0.7870297352600305,
|
|
"calibration/batch_entropy_100bins": 0.8287951367704729,
|
|
"calibration/batch_entropy_10bins": 0.7870297352600305,
|
|
"calibration/batch_entropy_50bins": 0.8444396351033697,
|
|
"calibration/batch_uniqueness": 0.9381528864738238,
|
|
"calibration/buffer_distribution_entropy": 0.836454052730403,
|
|
"calibration/buffer_entropy_100bins": 0.8535101399161015,
|
|
"calibration/buffer_entropy_10bins": 0.836454052730403,
|
|
"calibration/buffer_entropy_50bins": 0.8728050714670547,
|
|
"calibration/confidence_entropy": 0.5240414918619417,
|
|
"calibration/coverage@0%": 0.026351947388887843,
|
|
"calibration/coverage@1%": 0.026351947388887843,
|
|
"calibration/coverage@10%": 0.41923410430351626,
|
|
"calibration/coverage@15%": 0.5314265164218459,
|
|
"calibration/coverage@20%": 0.620989570820157,
|
|
"calibration/coverage@25%": 0.751987830680877,
|
|
"calibration/coverage@30%": 0.8425736230097444,
|
|
"calibration/coverage@5%": 0.19100326031606285,
|
|
"calibration/distribution_entropy_10": 0.7870297352600305,
|
|
"calibration/distribution_entropy_100": 0.8287951367704729,
|
|
"calibration/ece": 0.1261507501639957,
|
|
"calibration/mean_confidence": 0.6589088571731689,
|
|
"calibration/unique_confidence_per_question": 0.20416666666666666,
|
|
"calibration/unique_confidences": 78.4,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01015625,
|
|
"completions/max_length": 3217.4,
|
|
"completions/max_terminated_length": 3217.4,
|
|
"completions/mean_length": 651.4529541015625,
|
|
"completions/mean_terminated_length": 658.1746948242187,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 206.8,
|
|
"epoch": 0.34799565005437433,
|
|
"grad_norm": 0.0004082749364897609,
|
|
"learning_rate": 1.8975903614457832e-06,
|
|
"loss": -0.0092,
|
|
"num_tokens": 307031877.0,
|
|
"reward": 1.0682907581329346,
|
|
"reward_std": 0.11585188210010529,
|
|
"rewards/accuracy_reward": 0.7100694179534912,
|
|
"rewards/brier_reward": 0.8259410500526428,
|
|
"rewards/confidence_uniqueness_reward": 0.9193386673927307,
|
|
"rewards/format_reward": 0.9896701335906982,
|
|
"rewards/frontier_aurc_reward": -0.0016615271219052375,
|
|
"rewards/frontier_coverage_0": 0.010000471444800495,
|
|
"rewards/frontier_coverage_1": 0.010000471444800495,
|
|
"rewards/frontier_coverage_10": 0.010000471444800495,
|
|
"rewards/frontier_coverage_15": 0.010000471444800495,
|
|
"rewards/frontier_coverage_20": 0.010000471444800495,
|
|
"rewards/frontier_coverage_25": 0.01015151059255004,
|
|
"rewards/frontier_coverage_5": 0.010000471444800495,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.006473575532436371,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1404513895511627,
|
|
"signal/accuracy_reward/group_std_mean": 0.19066681563854218,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.44166667461395265,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07022569477558135,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07022569477558135,
|
|
"signal/advantage_abs_mean": 0.08215740621089936,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08215740621089936,
|
|
"signal/advantage_pre_scale_std": 0.15350556373596191,
|
|
"signal/advantage_std": 0.15350556373596191,
|
|
"signal/brier_reward/centered_abs_mean": 0.10650975555181504,
|
|
"signal/brier_reward/group_std_mean": 0.13953691720962524,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01331371944397688,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01331371944397688,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04581173062324524,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06800653263926507,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005726466327905655,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005726466327905655,
|
|
"signal/format_reward/centered_abs_mean": 0.01798502616584301,
|
|
"signal/format_reward/group_std_mean": 0.03467189371585846,
|
|
"signal/format_reward/group_zero_std_frac": 0.8555555701255798,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.008992513082921505,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.008992513082921505,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0017546760383993388,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0026522258296608923,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.7416813099989668e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.7416813099989668e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.10596181005239487,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1449078232049942,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.10596181005239487,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1449078232049942,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.10596181005239487,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1449078232049942,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.10596181005239487,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.1449078232049942,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.10596181005239487,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.1449078232049942,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.1005162313580513,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.13775036633014678,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0015705661149695515,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0015705661149695515,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.10596181005239487,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1449078232049942,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0016556532820686698,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.005957813002169132,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.011087938956916333,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0007447266252711415,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0007447266252711415,
|
|
"step": 145
|
|
},
|
|
{
|
|
"calibration/aurc": 0.18209448578532905,
|
|
"calibration/batch_distribution_entropy": 0.7402098532235222,
|
|
"calibration/batch_entropy_100bins": 0.8083018729096372,
|
|
"calibration/batch_entropy_10bins": 0.7402098532235222,
|
|
"calibration/batch_entropy_50bins": 0.8191539259469073,
|
|
"calibration/batch_uniqueness": 0.9301449071354948,
|
|
"calibration/buffer_distribution_entropy": 0.8311244629730232,
|
|
"calibration/buffer_entropy_100bins": 0.8587688993364191,
|
|
"calibration/buffer_entropy_10bins": 0.8311244629730232,
|
|
"calibration/buffer_entropy_50bins": 0.8739160708252068,
|
|
"calibration/confidence_entropy": 0.5100700509903451,
|
|
"calibration/coverage@0%": 0.004201001640614507,
|
|
"calibration/coverage@1%": 0.004201001640614507,
|
|
"calibration/coverage@10%": 0.34029064401297937,
|
|
"calibration/coverage@15%": 0.47469158826861174,
|
|
"calibration/coverage@20%": 0.5657170493820617,
|
|
"calibration/coverage@25%": 0.7275975167531156,
|
|
"calibration/coverage@30%": 0.8418513621822916,
|
|
"calibration/coverage@5%": 0.19152198006106938,
|
|
"calibration/distribution_entropy_10": 0.7402098532235222,
|
|
"calibration/distribution_entropy_100": 0.8083018729096372,
|
|
"calibration/ece": 0.12855990872542672,
|
|
"calibration/mean_confidence": 0.6749319072008179,
|
|
"calibration/unique_confidence_per_question": 0.19739583333333333,
|
|
"calibration/unique_confidences": 75.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.009635416666666674,
|
|
"completions/max_length": 3069.4,
|
|
"completions/max_terminated_length": 3069.4,
|
|
"completions/mean_length": 702.6961791992187,
|
|
"completions/mean_terminated_length": 709.4884643554688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 192.8,
|
|
"epoch": 0.3599955000562493,
|
|
"grad_norm": 0.0004130478191655129,
|
|
"learning_rate": 1.7469879518072292e-06,
|
|
"loss": -0.0071,
|
|
"num_tokens": 318237273.0,
|
|
"reward": 1.0614855289459229,
|
|
"reward_std": 0.1201270878314972,
|
|
"rewards/accuracy_reward": 0.6957465291023255,
|
|
"rewards/brier_reward": 0.8208768486976623,
|
|
"rewards/confidence_uniqueness_reward": 0.9183289051055908,
|
|
"rewards/format_reward": 0.9903645873069763,
|
|
"rewards/frontier_aurc_reward": -0.0020163535373285413,
|
|
"rewards/frontier_coverage_0": 0.013487431593239308,
|
|
"rewards/frontier_coverage_1": 0.013487431593239308,
|
|
"rewards/frontier_coverage_10": 0.013487431593239308,
|
|
"rewards/frontier_coverage_15": 0.013487431593239308,
|
|
"rewards/frontier_coverage_20": 0.012901889439672232,
|
|
"rewards/frontier_coverage_25": 0.020031385496258734,
|
|
"rewards/frontier_coverage_5": 0.013487431593239308,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0040603259578347204,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.15117729753255843,
|
|
"signal/accuracy_reward/group_std_mean": 0.20031063556671141,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4277777850627899,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07558864876627922,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07558864876627922,
|
|
"signal/advantage_abs_mean": 0.08776713460683823,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08776713460683823,
|
|
"signal/advantage_pre_scale_std": 0.1591554254293442,
|
|
"signal/advantage_std": 0.1591554254293442,
|
|
"signal/brier_reward/centered_abs_mean": 0.10871631652116776,
|
|
"signal/brier_reward/group_std_mean": 0.14245359599590302,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01358953956514597,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01358953956514597,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.042987743765115737,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.062032976746559144,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005373467970639467,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005373467970639467,
|
|
"signal/format_reward/centered_abs_mean": 0.015554470103234052,
|
|
"signal/format_reward/group_std_mean": 0.029216957837343217,
|
|
"signal/format_reward/group_zero_std_frac": 0.8805555582046509,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007777235051617026,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.007777235051617026,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0022927817422896623,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.003552949335426092,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.582471472327597e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.582471472327597e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.09796600192785263,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.13663864582777024,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.09796600192785263,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.13663864582777024,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.09796600192785263,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.13663864582777024,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09796600192785263,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.13663864582777024,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.09587855786085128,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.13392478972673416,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0014981024665758013,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0014981024665758013,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.036509061604738234,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.0526451326906681,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0005704540875740349,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0005704540875740349,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.09796600192785263,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.13663864582777024,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0015307187801226973,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0036625199019908903,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.006539558339864015,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.002777777798473835,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0004578149877488613,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0004578149877488613,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.3599955000562493,
|
|
"eval_completions/clipped_ratio": 0.011284722222222229,
|
|
"eval_completions/max_length": 2426.8333333333335,
|
|
"eval_completions/max_terminated_length": 2426.8333333333335,
|
|
"eval_completions/mean_length": 678.6144205729166,
|
|
"eval_completions/mean_terminated_length": 686.3630065917969,
|
|
"eval_completions/min_length": 51.5,
|
|
"eval_completions/min_terminated_length": 242.16666666666666,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 318237273.0,
|
|
"eval_reward": 1.037120411793391,
|
|
"eval_reward_std": 0.26037078599135083,
|
|
"eval_rewards/accuracy_reward": 0.6710069278875986,
|
|
"eval_rewards/brier_reward": 0.7987116674582163,
|
|
"eval_rewards/confidence_uniqueness_reward": 0.864409069220225,
|
|
"eval_rewards/format_reward": 0.9861111144224802,
|
|
"eval_rewards/frontier_aurc_reward": -0.0020773761401263378,
|
|
"eval_rewards/frontier_coverage_0": 0.008917404959599176,
|
|
"eval_rewards/frontier_coverage_1": 0.008917404959599176,
|
|
"eval_rewards/frontier_coverage_10": 0.008917404959599176,
|
|
"eval_rewards/frontier_coverage_15": 0.008917404959599176,
|
|
"eval_rewards/frontier_coverage_20": 0.00835825433023274,
|
|
"eval_rewards/frontier_coverage_25": 0.019696833721051615,
|
|
"eval_rewards/frontier_coverage_5": 0.008917404959599176,
|
|
"eval_rewards/true_frontier_ece_gap_only_reward": -0.003450475827169915,
|
|
"eval_runtime": 210.1124,
|
|
"eval_samples_per_second": 4.759,
|
|
"eval_signal/accuracy_reward/centered_abs_mean": 0.4268120676279068,
|
|
"eval_signal/accuracy_reward/group_std_mean": 0.46841634809970856,
|
|
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.2134060338139534,
|
|
"eval_signal/accuracy_reward/weight": 0.5,
|
|
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2134060338139534,
|
|
"eval_signal/advantage_abs_mean": 0.22498015811045965,
|
|
"eval_signal/advantage_pre_scale_abs_mean": 0.22498015811045965,
|
|
"eval_signal/advantage_pre_scale_std": 0.2599627524614334,
|
|
"eval_signal/advantage_std": 0.2599627524614334,
|
|
"eval_signal/brier_reward/centered_abs_mean": 0.19082651287317276,
|
|
"eval_signal/brier_reward/group_std_mean": 0.23933010548353195,
|
|
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.023853314109146595,
|
|
"eval_signal/brier_reward/weight": 0.125,
|
|
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.023853314109146595,
|
|
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.07184251459936301,
|
|
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.10839165622989337,
|
|
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008980314324920377,
|
|
"eval_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008980314324920377,
|
|
"eval_signal/format_reward/centered_abs_mean": 0.026475694806625445,
|
|
"eval_signal/format_reward/group_std_mean": 0.06660978465030591,
|
|
"eval_signal/format_reward/group_zero_std_frac": 0.6666666865348816,
|
|
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.013237847403312722,
|
|
"eval_signal/format_reward/weight": 0.5,
|
|
"eval_signal/format_reward/weighted_centered_abs_mean": 0.013237847403312722,
|
|
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.0033070850962152085,
|
|
"eval_signal/frontier_aurc_reward/group_std_mean": 0.006110090451935927,
|
|
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 5.167320462836263e-05,
|
|
"eval_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 5.167320462836263e-05,
|
|
"eval_signal/frontier_coverage_0/centered_abs_mean": 0.1586559092005094,
|
|
"eval_signal/frontier_coverage_0/group_std_mean": 0.24918479472398758,
|
|
"eval_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_0/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_1/centered_abs_mean": 0.1586559092005094,
|
|
"eval_signal/frontier_coverage_1/group_std_mean": 0.24918479472398758,
|
|
"eval_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_1/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_10/centered_abs_mean": 0.1586559092005094,
|
|
"eval_signal/frontier_coverage_10/group_std_mean": 0.24918479472398758,
|
|
"eval_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_10/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_15/centered_abs_mean": 0.1586559092005094,
|
|
"eval_signal/frontier_coverage_15/group_std_mean": 0.24918479472398758,
|
|
"eval_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_15/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_20/centered_abs_mean": 0.12169866388042767,
|
|
"eval_signal/frontier_coverage_20/group_std_mean": 0.19936797271172205,
|
|
"eval_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0019015416231316824,
|
|
"eval_signal/frontier_coverage_20/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0019015416231316824,
|
|
"eval_signal/frontier_coverage_25/centered_abs_mean": 0.04645684982339541,
|
|
"eval_signal/frontier_coverage_25/group_std_mean": 0.07699030389388402,
|
|
"eval_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0007258882784905533,
|
|
"eval_signal/frontier_coverage_25/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0007258882784905533,
|
|
"eval_signal/frontier_coverage_5/centered_abs_mean": 0.1586559092005094,
|
|
"eval_signal/frontier_coverage_5/group_std_mean": 0.24918479472398758,
|
|
"eval_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/frontier_coverage_5/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0024789985812579594,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0035569225437939167,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.005802453495562077,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0004446153179742396,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0004446153179742396,
|
|
"eval_steps_per_second": 0.029,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.3599955000562493,
|
|
"step": 150,
|
|
"train_probe_completions/clipped_ratio": 0.009375000000000003,
|
|
"train_probe_completions/max_length": 2373.3333333333335,
|
|
"train_probe_completions/max_terminated_length": 2373.3333333333335,
|
|
"train_probe_completions/mean_length": 680.1991678873698,
|
|
"train_probe_completions/mean_terminated_length": 686.6373596191406,
|
|
"train_probe_completions/min_length": 70.66666666666667,
|
|
"train_probe_completions/min_terminated_length": 212.5,
|
|
"train_probe_loss": 0.0,
|
|
"train_probe_num_tokens": 318237273.0,
|
|
"train_probe_reward": 1.055055598417918,
|
|
"train_probe_reward_std": 0.24644359946250916,
|
|
"train_probe_rewards/accuracy_reward": 0.6944444477558136,
|
|
"train_probe_rewards/brier_reward": 0.8187563320000967,
|
|
"train_probe_rewards/confidence_uniqueness_reward": 0.873372862736384,
|
|
"train_probe_rewards/format_reward": 0.9904513855775198,
|
|
"train_probe_rewards/frontier_aurc_reward": -0.001900765870232135,
|
|
"train_probe_rewards/frontier_coverage_0": 0.012389092764351517,
|
|
"train_probe_rewards/frontier_coverage_1": 0.012389092764351517,
|
|
"train_probe_rewards/frontier_coverage_10": 0.012389092764351517,
|
|
"train_probe_rewards/frontier_coverage_15": 0.012389092764351517,
|
|
"train_probe_rewards/frontier_coverage_20": 0.014410387520911172,
|
|
"train_probe_rewards/frontier_coverage_25": 0.024321939796209335,
|
|
"train_probe_rewards/frontier_coverage_5": 0.012389092764351517,
|
|
"train_probe_rewards/true_frontier_ece_gap_only_reward": -0.0036154407619809112,
|
|
"train_probe_runtime": 188.9198,
|
|
"train_probe_samples_per_second": 5.293,
|
|
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.4124348958333333,
|
|
"train_probe_signal/accuracy_reward/group_std_mean": 0.45987477401892346,
|
|
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.20621744791666666,
|
|
"train_probe_signal/accuracy_reward/weight": 0.5,
|
|
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.20621744791666666,
|
|
"train_probe_signal/advantage_abs_mean": 0.21144999066988626,
|
|
"train_probe_signal/advantage_pre_scale_abs_mean": 0.21144999066988626,
|
|
"train_probe_signal/advantage_pre_scale_std": 0.24553329994281134,
|
|
"train_probe_signal/advantage_std": 0.24553329994281134,
|
|
"train_probe_signal/brier_reward/centered_abs_mean": 0.17376654346783957,
|
|
"train_probe_signal/brier_reward/group_std_mean": 0.2235363299647967,
|
|
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.021720817933479946,
|
|
"train_probe_signal/brier_reward/weight": 0.125,
|
|
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.021720817933479946,
|
|
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.060595336059729256,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.09015070833265781,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.007574417007466157,
|
|
"train_probe_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007574417007466157,
|
|
"train_probe_signal/format_reward/centered_abs_mean": 0.018283420087148745,
|
|
"train_probe_signal/format_reward/group_std_mean": 0.04803628505518039,
|
|
"train_probe_signal/format_reward/group_zero_std_frac": 0.7500000149011612,
|
|
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.009141710043574372,
|
|
"train_probe_signal/format_reward/weight": 0.5,
|
|
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.009141710043574372,
|
|
"train_probe_signal/frontier_aurc_reward/centered_abs_mean": 0.003131849652466675,
|
|
"train_probe_signal/frontier_aurc_reward/group_std_mean": 0.0059242877177894115,
|
|
"train_probe_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.8935150819791794e-05,
|
|
"train_probe_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"train_probe_signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.8935150819791794e-05,
|
|
"train_probe_signal/frontier_coverage_0/centered_abs_mean": 0.16411924362182617,
|
|
"train_probe_signal/frontier_coverage_0/group_std_mean": 0.25143779317537945,
|
|
"train_probe_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_0/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_1/centered_abs_mean": 0.16411924362182617,
|
|
"train_probe_signal/frontier_coverage_1/group_std_mean": 0.25143779317537945,
|
|
"train_probe_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_1/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_10/centered_abs_mean": 0.16411924362182617,
|
|
"train_probe_signal/frontier_coverage_10/group_std_mean": 0.25143779317537945,
|
|
"train_probe_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_10/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_15/centered_abs_mean": 0.16411924362182617,
|
|
"train_probe_signal/frontier_coverage_15/group_std_mean": 0.25143779317537945,
|
|
"train_probe_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_15/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_20/centered_abs_mean": 0.11266574015220006,
|
|
"train_probe_signal/frontier_coverage_20/group_std_mean": 0.17991459121306738,
|
|
"train_probe_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.001760402189878126,
|
|
"train_probe_signal/frontier_coverage_20/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.001760402189878126,
|
|
"train_probe_signal/frontier_coverage_25/centered_abs_mean": 0.04643759255607923,
|
|
"train_probe_signal/frontier_coverage_25/group_std_mean": 0.07232892637451489,
|
|
"train_probe_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.000725587383688738,
|
|
"train_probe_signal/frontier_coverage_25/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.000725587383688738,
|
|
"train_probe_signal/frontier_coverage_5/centered_abs_mean": 0.16411924362182617,
|
|
"train_probe_signal/frontier_coverage_5/group_std_mean": 0.25143779317537945,
|
|
"train_probe_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/frontier_coverage_5/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.002564363181591034,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0037723184019948044,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.00622099117996792,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00047153980024935055,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00047153980024935055,
|
|
"train_probe_steps_per_second": 0.032
|
|
},
|
|
{
|
|
"calibration/aurc": 0.13391643198760078,
|
|
"calibration/batch_distribution_entropy": 0.7948606211172603,
|
|
"calibration/batch_entropy_100bins": 0.8368200343599981,
|
|
"calibration/batch_entropy_10bins": 0.7948606211172603,
|
|
"calibration/batch_entropy_50bins": 0.8510265931141483,
|
|
"calibration/batch_uniqueness": 0.944000865460584,
|
|
"calibration/buffer_distribution_entropy": 0.8213485685365185,
|
|
"calibration/buffer_entropy_100bins": 0.8620536929835996,
|
|
"calibration/buffer_entropy_10bins": 0.8213485685365185,
|
|
"calibration/buffer_entropy_50bins": 0.8727871263863621,
|
|
"calibration/confidence_entropy": 0.5393953875867862,
|
|
"calibration/coverage@0%": 0.032740440088172104,
|
|
"calibration/coverage@1%": 0.032740440088172104,
|
|
"calibration/coverage@10%": 0.3944664512557533,
|
|
"calibration/coverage@15%": 0.6908698207865707,
|
|
"calibration/coverage@20%": 0.7854961519008106,
|
|
"calibration/coverage@25%": 0.8895916777165667,
|
|
"calibration/coverage@30%": 0.9335078534031414,
|
|
"calibration/coverage@5%": 0.24789239883630465,
|
|
"calibration/distribution_entropy_10": 0.7948606211172603,
|
|
"calibration/distribution_entropy_100": 0.8368200343599981,
|
|
"calibration/ece": 0.1253224171661008,
|
|
"calibration/mean_confidence": 0.6584433504347348,
|
|
"calibration/unique_confidence_per_question": 0.196875,
|
|
"calibration/unique_confidences": 75.6,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.008767361111111116,
|
|
"completions/max_length": 3146.6,
|
|
"completions/max_terminated_length": 3146.6,
|
|
"completions/mean_length": 669.9478393554688,
|
|
"completions/mean_terminated_length": 675.8888916015625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 214.6,
|
|
"epoch": 0.3719953500581243,
|
|
"grad_norm": 0.0004369099042378366,
|
|
"learning_rate": 1.5963855421686747e-06,
|
|
"loss": -0.0074,
|
|
"num_tokens": 329062784.0,
|
|
"reward": 1.0875563144683837,
|
|
"reward_std": 0.1229211449623108,
|
|
"rewards/accuracy_reward": 0.7425347328186035,
|
|
"rewards/brier_reward": 0.835495126247406,
|
|
"rewards/confidence_uniqueness_reward": 0.9289066553115845,
|
|
"rewards/format_reward": 0.9911458492279053,
|
|
"rewards/frontier_aurc_reward": -0.0014140044804662466,
|
|
"rewards/frontier_coverage_0": -0.001173873944208026,
|
|
"rewards/frontier_coverage_1": -0.001173873944208026,
|
|
"rewards/frontier_coverage_10": -0.001173873944208026,
|
|
"rewards/frontier_coverage_15": -0.001173873944208026,
|
|
"rewards/frontier_coverage_20": 0.01185264540836215,
|
|
"rewards/frontier_coverage_25": 0.028303157165646554,
|
|
"rewards/frontier_coverage_5": -0.001173873944208026,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0027825822588056328,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16309678852558135,
|
|
"signal/accuracy_reward/group_std_mean": 0.21074391305446624,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4111111104488373,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08154839426279067,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08154839426279067,
|
|
"signal/advantage_abs_mean": 0.09126082807779312,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09126082807779312,
|
|
"signal/advantage_pre_scale_std": 0.16173238456249237,
|
|
"signal/advantage_std": 0.16173238456249237,
|
|
"signal/brier_reward/centered_abs_mean": 0.10318089425563812,
|
|
"signal/brier_reward/group_std_mean": 0.13717953413724898,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.012897611781954765,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.012897611781954765,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03800181671977043,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05584709048271179,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004750227089971304,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004750227089971304,
|
|
"signal/format_reward/centered_abs_mean": 0.014941406436264515,
|
|
"signal/format_reward/group_std_mean": 0.02782573737204075,
|
|
"signal/format_reward/group_zero_std_frac": 0.8861111164093017,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0074707032181322575,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.0074707032181322575,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0018860452808439731,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0032281734980642795,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.946945751318708e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.946945751318708e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.10934300273656845,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.15232057571411134,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.10934300273656845,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.15232057571411134,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.10934300273656845,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.15232057571411134,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.10934300273656845,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.15232057571411134,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.055034608393907544,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.08059312552213668,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0008599157561548054,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0008599157561548054,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.033925560861825944,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.04722090288996696,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0005300868884660304,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0005300868884660304,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.10934300273656845,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.15232057571411134,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001708484417758882,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.002597217308357358,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.0038485261145979168,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00032465216354466976,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00032465216354466976,
|
|
"step": 155
|
|
},
|
|
{
|
|
"calibration/aurc": 0.15233241534879957,
|
|
"calibration/batch_distribution_entropy": 0.7053979650865262,
|
|
"calibration/batch_entropy_100bins": 0.7897403240858474,
|
|
"calibration/batch_entropy_10bins": 0.7053979650865262,
|
|
"calibration/batch_entropy_50bins": 0.7958750116338504,
|
|
"calibration/batch_uniqueness": 0.9272095434772781,
|
|
"calibration/buffer_distribution_entropy": 0.8133656085702008,
|
|
"calibration/buffer_entropy_100bins": 0.8631911273929548,
|
|
"calibration/buffer_entropy_10bins": 0.8133656085702008,
|
|
"calibration/buffer_entropy_50bins": 0.8712631524302715,
|
|
"calibration/confidence_entropy": 0.5216451024089949,
|
|
"calibration/coverage@0%": 0.12002745244582987,
|
|
"calibration/coverage@1%": 0.20824453289395234,
|
|
"calibration/coverage@10%": 0.45039907142423363,
|
|
"calibration/coverage@15%": 0.735983934467869,
|
|
"calibration/coverage@20%": 0.773753280839895,
|
|
"calibration/coverage@25%": 0.7937007874015748,
|
|
"calibration/coverage@30%": 0.8,
|
|
"calibration/coverage@5%": 0.27139264976610233,
|
|
"calibration/distribution_entropy_10": 0.7053979650865262,
|
|
"calibration/distribution_entropy_100": 0.7897403240858474,
|
|
"calibration/ece": 0.13596159831377114,
|
|
"calibration/mean_confidence": 0.7032028452021251,
|
|
"calibration/unique_confidence_per_question": 0.17864583333333334,
|
|
"calibration/unique_confidences": 68.6,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.011979166666666674,
|
|
"completions/max_length": 3056.6,
|
|
"completions/max_terminated_length": 3056.6,
|
|
"completions/mean_length": 679.8614624023437,
|
|
"completions/mean_terminated_length": 688.1098266601563,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 200.0,
|
|
"epoch": 0.38399520005999926,
|
|
"grad_norm": 0.0003775613440666348,
|
|
"learning_rate": 1.4457831325301204e-06,
|
|
"loss": -0.0102,
|
|
"num_tokens": 339982084.0,
|
|
"reward": 1.0505271673202514,
|
|
"reward_std": 0.1207915186882019,
|
|
"rewards/accuracy_reward": 0.677256953716278,
|
|
"rewards/brier_reward": 0.8067374706268311,
|
|
"rewards/confidence_uniqueness_reward": 0.9252258539199829,
|
|
"rewards/format_reward": 0.9880208373069763,
|
|
"rewards/frontier_aurc_reward": -0.0025012485682964327,
|
|
"rewards/frontier_coverage_0": 0.012539402535185217,
|
|
"rewards/frontier_coverage_1": 0.012539402535185217,
|
|
"rewards/frontier_coverage_10": 0.012539402535185217,
|
|
"rewards/frontier_coverage_15": 0.013295956503134221,
|
|
"rewards/frontier_coverage_20": 0.016371296532452107,
|
|
"rewards/frontier_coverage_25": 0.02971927933394909,
|
|
"rewards/frontier_coverage_5": 0.012539402535185217,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0022376260720193388,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1533745676279068,
|
|
"signal/accuracy_reward/group_std_mean": 0.1958913177251816,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4666666746139526,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0766872838139534,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0766872838139534,
|
|
"signal/advantage_abs_mean": 0.09020575135946274,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09020575135946274,
|
|
"signal/advantage_pre_scale_std": 0.16270052194595336,
|
|
"signal/advantage_std": 0.16270052194595336,
|
|
"signal/brier_reward/centered_abs_mean": 0.10894776731729508,
|
|
"signal/brier_reward/group_std_mean": 0.1400896966457367,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013618470914661885,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013618470914661885,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04274830222129822,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06329518854618073,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005343537777662277,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005343537777662277,
|
|
"signal/format_reward/centered_abs_mean": 0.019791666604578494,
|
|
"signal/format_reward/group_std_mean": 0.03536950312554836,
|
|
"signal/format_reward/group_zero_std_frac": 0.8611111164093017,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.009895833302289247,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.009895833302289247,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0028245057445019485,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.004615729767829179,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.4132902257842946e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.4132902257842946e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.11053884625434876,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1505269557237625,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.11053884625434876,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1505269557237625,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.11053884625434876,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1505269557237625,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09622148275375367,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.13204507827758788,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001503460668027401,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001503460668027401,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04334339499473572,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.06285871043801308,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0006772405467927456,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0006772405467927456,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.036124877631664276,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.04845571741461754,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0005644512129947543,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0005644512129947543,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.11053884625434876,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1505269557237625,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0017271694727241994,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0021982237696647642,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.00320082139223814,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00027477797120809553,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00027477797120809553,
|
|
"step": 160
|
|
},
|
|
{
|
|
"calibration/aurc": 0.18058905327825436,
|
|
"calibration/batch_distribution_entropy": 0.7767610930080233,
|
|
"calibration/batch_entropy_100bins": 0.828420695410555,
|
|
"calibration/batch_entropy_10bins": 0.7767610930080233,
|
|
"calibration/batch_entropy_50bins": 0.8396945215174878,
|
|
"calibration/batch_uniqueness": 0.9377851813118999,
|
|
"calibration/buffer_distribution_entropy": 0.8137825525481095,
|
|
"calibration/buffer_entropy_100bins": 0.8667738672543628,
|
|
"calibration/buffer_entropy_10bins": 0.8137825525481095,
|
|
"calibration/buffer_entropy_50bins": 0.873905604077686,
|
|
"calibration/confidence_entropy": 0.5159103697988725,
|
|
"calibration/coverage@0%": 0.0042729555744199215,
|
|
"calibration/coverage@1%": 0.0042729555744199215,
|
|
"calibration/coverage@10%": 0.41248231465761176,
|
|
"calibration/coverage@15%": 0.4753775820599887,
|
|
"calibration/coverage@20%": 0.532258064516129,
|
|
"calibration/coverage@25%": 0.7172935454271994,
|
|
"calibration/coverage@30%": 0.8444380428056248,
|
|
"calibration/coverage@5%": 0.21247736276174306,
|
|
"calibration/distribution_entropy_10": 0.7767610930080233,
|
|
"calibration/distribution_entropy_100": 0.828420695410555,
|
|
"calibration/ece": 0.12461592848485974,
|
|
"calibration/mean_confidence": 0.6432186546021172,
|
|
"calibration/unique_confidence_per_question": 0.20104166666666665,
|
|
"calibration/unique_confidences": 77.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01362847222222221,
|
|
"completions/max_length": 3466.0,
|
|
"completions/max_terminated_length": 3466.0,
|
|
"completions/mean_length": 714.0564208984375,
|
|
"completions/mean_terminated_length": 724.0512939453125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 200.6,
|
|
"epoch": 0.39599505006187424,
|
|
"grad_norm": 0.0003885479236487299,
|
|
"learning_rate": 1.2951807228915664e-06,
|
|
"loss": -0.0108,
|
|
"num_tokens": 351347086.0,
|
|
"reward": 1.0483879327774048,
|
|
"reward_std": 0.12390840351581574,
|
|
"rewards/accuracy_reward": 0.6697048664093017,
|
|
"rewards/brier_reward": 0.8130905270576477,
|
|
"rewards/confidence_uniqueness_reward": 0.9271872401237488,
|
|
"rewards/format_reward": 0.9863715410232544,
|
|
"rewards/frontier_aurc_reward": -0.002153858123347163,
|
|
"rewards/frontier_coverage_0": 0.028279137797653675,
|
|
"rewards/frontier_coverage_1": 0.028279137797653675,
|
|
"rewards/frontier_coverage_10": 0.028279137797653675,
|
|
"rewards/frontier_coverage_15": 0.028847700357437132,
|
|
"rewards/frontier_coverage_20": 0.02469187043607235,
|
|
"rewards/frontier_coverage_25": 0.03973658010363579,
|
|
"rewards/frontier_coverage_5": 0.028279137797653675,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0030099464114755393,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.14902886599302292,
|
|
"signal/accuracy_reward/group_std_mean": 0.19532329142093657,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.44166667461395265,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07451443299651146,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07451443299651146,
|
|
"signal/advantage_abs_mean": 0.09079683572053909,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09079683572053909,
|
|
"signal/advantage_pre_scale_std": 0.16342334747314452,
|
|
"signal/advantage_std": 0.16342334747314452,
|
|
"signal/brier_reward/centered_abs_mean": 0.11507419794797898,
|
|
"signal/brier_reward/group_std_mean": 0.15091157853603362,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014384274743497372,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014384274743497372,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04255444556474686,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06380771696567536,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005319305695593357,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005319305695593357,
|
|
"signal/format_reward/centered_abs_mean": 0.022292751632630826,
|
|
"signal/format_reward/group_std_mean": 0.03928981348872185,
|
|
"signal/format_reward/group_zero_std_frac": 0.8472222208976745,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.011146375816315413,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.011146375816315413,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0026308648753911256,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0045830888208001856,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.110726367798634e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.110726367798634e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.12479231059551239,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.16951032280921935,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.12479231059551239,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.16951032280921935,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12479231059551239,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.16951032280921935,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09972788542509078,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.13676573038101197,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0015582482097670435,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0015582482097670435,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.05088546723127365,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.07109279409050942,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007950854254886508,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007950854254886508,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.04205540716648102,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.05501595437526703,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0006571157369762659,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0006571157369762659,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.12479231059551239,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.16951032280921935,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0019498798530548811,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0027308772783726453,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.0037563166581094263,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00034135965979658066,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00034135965979658066,
|
|
"step": 165
|
|
},
|
|
{
|
|
"calibration/aurc": 0.14844663172603637,
|
|
"calibration/batch_distribution_entropy": 0.6995723211429985,
|
|
"calibration/batch_entropy_100bins": 0.7860955102263528,
|
|
"calibration/batch_entropy_10bins": 0.6995723211429985,
|
|
"calibration/batch_entropy_50bins": 0.7898930908087898,
|
|
"calibration/batch_uniqueness": 0.9239117880487016,
|
|
"calibration/buffer_distribution_entropy": 0.8153715309967344,
|
|
"calibration/buffer_entropy_100bins": 0.8700450976408975,
|
|
"calibration/buffer_entropy_10bins": 0.8153715309967344,
|
|
"calibration/buffer_entropy_50bins": 0.8761398327730963,
|
|
"calibration/confidence_entropy": 0.511929929827141,
|
|
"calibration/coverage@0%": 0.014253398536593877,
|
|
"calibration/coverage@1%": 0.014253398536593877,
|
|
"calibration/coverage@10%": 0.3627433108172956,
|
|
"calibration/coverage@15%": 0.650761552543238,
|
|
"calibration/coverage@20%": 0.7580184572251633,
|
|
"calibration/coverage@25%": 0.9351347617666155,
|
|
"calibration/coverage@30%": 0.9687830687830689,
|
|
"calibration/coverage@5%": 0.014253398536593877,
|
|
"calibration/distribution_entropy_10": 0.6995723211429985,
|
|
"calibration/distribution_entropy_100": 0.7860955102263528,
|
|
"calibration/ece": 0.09246575064390247,
|
|
"calibration/mean_confidence": 0.7047365129909607,
|
|
"calibration/unique_confidence_per_question": 0.1796875,
|
|
"calibration/unique_confidences": 69.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.009809027777777767,
|
|
"completions/max_length": 3302.6,
|
|
"completions/max_terminated_length": 3302.6,
|
|
"completions/mean_length": 686.3215209960938,
|
|
"completions/mean_terminated_length": 693.1324462890625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 190.6,
|
|
"epoch": 0.4079949000637492,
|
|
"grad_norm": 0.0003591932763811201,
|
|
"learning_rate": 1.1445783132530121e-06,
|
|
"loss": -0.0087,
|
|
"num_tokens": 362342694.0,
|
|
"reward": 1.0759626150131225,
|
|
"reward_std": 0.12116758078336716,
|
|
"rewards/accuracy_reward": 0.7206597208976746,
|
|
"rewards/brier_reward": 0.8285124659538269,
|
|
"rewards/confidence_uniqueness_reward": 0.9224372029304504,
|
|
"rewards/format_reward": 0.9901909828186035,
|
|
"rewards/frontier_aurc_reward": -0.002220888831652701,
|
|
"rewards/frontier_coverage_0": 0.010648279171437024,
|
|
"rewards/frontier_coverage_1": 0.010648279171437024,
|
|
"rewards/frontier_coverage_10": 0.010648279171437024,
|
|
"rewards/frontier_coverage_15": 0.015687369927763938,
|
|
"rewards/frontier_coverage_20": 0.02136296220123768,
|
|
"rewards/frontier_coverage_25": 0.0481999009847641,
|
|
"rewards/frontier_coverage_5": 0.010648279171437024,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.002354492200538516,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.14618055820465087,
|
|
"signal/accuracy_reward/group_std_mean": 0.19709926843643188,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.42777777314186094,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07309027910232543,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07309027910232543,
|
|
"signal/advantage_abs_mean": 0.08560824990272523,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08560824990272523,
|
|
"signal/advantage_pre_scale_std": 0.1597005158662796,
|
|
"signal/advantage_std": 0.1597005158662796,
|
|
"signal/brier_reward/centered_abs_mean": 0.10978586375713348,
|
|
"signal/brier_reward/group_std_mean": 0.14447366297245026,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013723232969641685,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013723232969641685,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04156898036599159,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06315687522292138,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005196122545748949,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005196122545748949,
|
|
"signal/format_reward/centered_abs_mean": 0.017355685867369176,
|
|
"signal/format_reward/group_std_mean": 0.03438038341701031,
|
|
"signal/format_reward/group_zero_std_frac": 0.8527777791023254,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.008677842933684588,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.008677842933684588,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.002847391273826361,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.004856492578983307,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.449048865353689e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.449048865353689e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.10908669680356979,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.1511477291584015,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.10908669680356979,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.1511477291584015,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.10908669680356979,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1511477291584015,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.08291138708591461,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.11675856113433838,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0012954904232174158,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0012954904232174158,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.0416046604514122,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.059697122871875764,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0006500728195533156,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0006500728195533156,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.041713655740022657,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.05425269529223442,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.000651775870937854,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.000651775870937854,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.10908669680356979,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.1511477291584015,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001704479637555778,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0024256525095552204,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.0035729790572077034,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00030320656369440255,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00030320656369440255,
|
|
"step": 170
|
|
},
|
|
{
|
|
"calibration/aurc": 0.14461688721979393,
|
|
"calibration/batch_distribution_entropy": 0.7800736712882704,
|
|
"calibration/batch_entropy_100bins": 0.8300042856724333,
|
|
"calibration/batch_entropy_10bins": 0.7800736712882704,
|
|
"calibration/batch_entropy_50bins": 0.8404826975612186,
|
|
"calibration/batch_uniqueness": 0.938196954829919,
|
|
"calibration/buffer_distribution_entropy": 0.8152604036836385,
|
|
"calibration/buffer_entropy_100bins": 0.8717666012501055,
|
|
"calibration/buffer_entropy_10bins": 0.8152604036836385,
|
|
"calibration/buffer_entropy_50bins": 0.8767388825784943,
|
|
"calibration/confidence_entropy": 0.5208488747591534,
|
|
"calibration/coverage@0%": 0.012662042165713209,
|
|
"calibration/coverage@1%": 0.012662042165713209,
|
|
"calibration/coverage@10%": 0.3192511394437657,
|
|
"calibration/coverage@15%": 0.5831173985414801,
|
|
"calibration/coverage@20%": 0.8100232338874823,
|
|
"calibration/coverage@25%": 0.9231852982671059,
|
|
"calibration/coverage@30%": 0.9916449086161879,
|
|
"calibration/coverage@5%": 0.1479950273702389,
|
|
"calibration/distribution_entropy_10": 0.7800736712882704,
|
|
"calibration/distribution_entropy_100": 0.8300042856724333,
|
|
"calibration/ece": 0.0991099850213959,
|
|
"calibration/mean_confidence": 0.6590511411080155,
|
|
"calibration/unique_confidence_per_question": 0.20572916666666666,
|
|
"calibration/unique_confidences": 79.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.011197916666666674,
|
|
"completions/max_length": 3595.2,
|
|
"completions/max_terminated_length": 3595.2,
|
|
"completions/mean_length": 714.6521850585938,
|
|
"completions/mean_terminated_length": 722.7832885742188,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 208.0,
|
|
"epoch": 0.4199947500656242,
|
|
"grad_norm": 0.0004064071399625391,
|
|
"learning_rate": 9.93975903614458e-07,
|
|
"loss": -0.0108,
|
|
"num_tokens": 373683455.0,
|
|
"reward": 1.0710617542266845,
|
|
"reward_std": 0.12647038847208023,
|
|
"rewards/accuracy_reward": 0.7127604246139526,
|
|
"rewards/brier_reward": 0.823624587059021,
|
|
"rewards/confidence_uniqueness_reward": 0.9248634934425354,
|
|
"rewards/format_reward": 0.9888020873069763,
|
|
"rewards/frontier_aurc_reward": -0.0020120171364396812,
|
|
"rewards/frontier_coverage_0": 0.01165504176169634,
|
|
"rewards/frontier_coverage_1": 0.01165504176169634,
|
|
"rewards/frontier_coverage_10": 0.01165504176169634,
|
|
"rewards/frontier_coverage_15": 0.015564435138367116,
|
|
"rewards/frontier_coverage_20": 0.021842183917760848,
|
|
"rewards/frontier_coverage_25": 0.051497886329889296,
|
|
"rewards/frontier_coverage_5": 0.01165504176169634,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0029332443606108426,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.15782877802848816,
|
|
"signal/accuracy_reward/group_std_mean": 0.21180022656917571,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.39166666865348815,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07891438901424408,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07891438901424408,
|
|
"signal/advantage_abs_mean": 0.0904485046863556,
|
|
"signal/advantage_pre_scale_abs_mean": 0.0904485046863556,
|
|
"signal/advantage_pre_scale_std": 0.16463069915771483,
|
|
"signal/advantage_std": 0.16463069915771483,
|
|
"signal/brier_reward/centered_abs_mean": 0.11573301851749421,
|
|
"signal/brier_reward/group_std_mean": 0.1542545437812805,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014466627314686776,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014466627314686776,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.042830513417720796,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06403593942523003,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0053538141772150995,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0053538141772150995,
|
|
"signal/format_reward/centered_abs_mean": 0.01941731758415699,
|
|
"signal/format_reward/group_std_mean": 0.03575590215623379,
|
|
"signal/format_reward/group_zero_std_frac": 0.8583333373069764,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.009708658792078495,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.009708658792078495,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0027021993417292835,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.00471522705629468,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.2221864714520055e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.2221864714520055e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.12408257275819778,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.173751300573349,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.12408257275819778,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.173751300573349,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12408257275819778,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.173751300573349,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.09372715502977372,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.13314026296138765,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0014644867973402143,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0014644867973402143,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04657027423381806,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.0670913964509964,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007276605349034071,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007276605349034071,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.04459658488631248,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.05846061035990715,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0006968216388486326,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0006968216388486326,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.12408257275819778,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.173751300573349,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0019387901993468404,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0028843230567872523,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.0041553780902177095,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00036054038209840653,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00036054038209840653,
|
|
"step": 175
|
|
},
|
|
{
|
|
"calibration/aurc": 0.10250645380777204,
|
|
"calibration/batch_distribution_entropy": 0.7497535770739696,
|
|
"calibration/batch_entropy_100bins": 0.8134197295262965,
|
|
"calibration/batch_entropy_10bins": 0.7497535770739696,
|
|
"calibration/batch_entropy_50bins": 0.8202766103654543,
|
|
"calibration/batch_uniqueness": 0.9356761553909309,
|
|
"calibration/buffer_distribution_entropy": 0.8144947209468967,
|
|
"calibration/buffer_entropy_100bins": 0.8727833543328449,
|
|
"calibration/buffer_entropy_10bins": 0.8144947209468967,
|
|
"calibration/buffer_entropy_50bins": 0.876997242538901,
|
|
"calibration/confidence_entropy": 0.5126060799233956,
|
|
"calibration/coverage@0%": 0.013113817192110735,
|
|
"calibration/coverage@1%": 0.013113817192110735,
|
|
"calibration/coverage@10%": 0.5760200171226603,
|
|
"calibration/coverage@15%": 0.8476765097999672,
|
|
"calibration/coverage@20%": 0.9179676956091463,
|
|
"calibration/coverage@25%": 0.9608355091383812,
|
|
"calibration/coverage@30%": 0.981201044386423,
|
|
"calibration/coverage@5%": 0.24902806695100796,
|
|
"calibration/distribution_entropy_10": 0.7497535770739696,
|
|
"calibration/distribution_entropy_100": 0.8134197295262965,
|
|
"calibration/ece": 0.12372607921773937,
|
|
"calibration/mean_confidence": 0.6789990075283177,
|
|
"calibration/unique_confidence_per_question": 0.19635416666666666,
|
|
"calibration/unique_confidences": 75.4,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.010243055555555537,
|
|
"completions/max_length": 3533.2,
|
|
"completions/max_terminated_length": 3533.2,
|
|
"completions/mean_length": 687.0453247070312,
|
|
"completions/mean_terminated_length": 694.14697265625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 214.8,
|
|
"epoch": 0.4319946000674992,
|
|
"grad_norm": 0.0003923497861251235,
|
|
"learning_rate": 8.433734939759036e-07,
|
|
"loss": -0.009,
|
|
"num_tokens": 384698185.0,
|
|
"reward": 1.0713138580322266,
|
|
"reward_std": 0.12391123622655868,
|
|
"rewards/accuracy_reward": 0.7142361044883728,
|
|
"rewards/brier_reward": 0.8178089022636413,
|
|
"rewards/confidence_uniqueness_reward": 0.9249926686286927,
|
|
"rewards/format_reward": 0.9896701335906982,
|
|
"rewards/frontier_aurc_reward": -0.0023395067546516657,
|
|
"rewards/frontier_coverage_0": 0.009305649372981862,
|
|
"rewards/frontier_coverage_1": 0.009305649372981862,
|
|
"rewards/frontier_coverage_10": 0.009305649372981862,
|
|
"rewards/frontier_coverage_15": 0.013631703774444759,
|
|
"rewards/frontier_coverage_20": 0.022413885779678823,
|
|
"rewards/frontier_coverage_25": 0.05289755538105965,
|
|
"rewards/frontier_coverage_5": 0.009305649372981862,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0033940633293241262,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.1560980886220932,
|
|
"signal/accuracy_reward/group_std_mean": 0.20446191132068633,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.41666666865348817,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0780490443110466,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0780490443110466,
|
|
"signal/advantage_abs_mean": 0.09040587842464447,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09040587842464447,
|
|
"signal/advantage_pre_scale_std": 0.1639217257499695,
|
|
"signal/advantage_std": 0.1639217257499695,
|
|
"signal/brier_reward/centered_abs_mean": 0.1156775861978531,
|
|
"signal/brier_reward/group_std_mean": 0.15296037197113038,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014459698274731637,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014459698274731637,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04092123620212078,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06180307194590569,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005115154525265097,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005115154525265097,
|
|
"signal/format_reward/centered_abs_mean": 0.017681206576526165,
|
|
"signal/format_reward/group_std_mean": 0.03354543596506119,
|
|
"signal/format_reward/group_zero_std_frac": 0.8611111044883728,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.008840603288263083,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.008840603288263083,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.002855647308751941,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.004864505957812071,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.4619489199249077e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.4619489199249077e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.12355190813541413,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.16903219521045684,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.12355190813541413,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.16903219521045684,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12355190813541413,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.16903219521045684,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.08319898694753647,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.11640357077121735,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0012999841710552573,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0012999841710552573,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04654121547937393,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.06476506888866425,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007272064918652177,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007272064918652177,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.048268646001815796,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.06180224493145943,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0007541975937783718,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0007541975937783718,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.12355190813541413,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.16903219521045684,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0019304985646158457,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0031099628657102587,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.004470847826451063,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00038874535821378233,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00038874535821378233,
|
|
"step": 180
|
|
},
|
|
{
|
|
"calibration/aurc": 0.16988037415763974,
|
|
"calibration/batch_distribution_entropy": 0.7940710514827269,
|
|
"calibration/batch_entropy_100bins": 0.8368481344782455,
|
|
"calibration/batch_entropy_10bins": 0.7940710514827269,
|
|
"calibration/batch_entropy_50bins": 0.8476841669994413,
|
|
"calibration/batch_uniqueness": 0.9411219051737953,
|
|
"calibration/buffer_distribution_entropy": 0.8081097969439645,
|
|
"calibration/buffer_entropy_100bins": 0.8695779315744969,
|
|
"calibration/buffer_entropy_10bins": 0.8081097969439645,
|
|
"calibration/buffer_entropy_50bins": 0.873141871107264,
|
|
"calibration/confidence_entropy": 0.5195127592582154,
|
|
"calibration/coverage@0%": 0.022429873118014296,
|
|
"calibration/coverage@1%": 0.022429873118014296,
|
|
"calibration/coverage@10%": 0.15776704580497936,
|
|
"calibration/coverage@15%": 0.47283910007332636,
|
|
"calibration/coverage@20%": 0.8386187835365227,
|
|
"calibration/coverage@25%": 0.8956120693790559,
|
|
"calibration/coverage@30%": 0.9506561679790027,
|
|
"calibration/coverage@5%": 0.044304873118014294,
|
|
"calibration/distribution_entropy_10": 0.7940710514827269,
|
|
"calibration/distribution_entropy_100": 0.8368481344782455,
|
|
"calibration/ece": 0.11371638934936182,
|
|
"calibration/mean_confidence": 0.6561408147419053,
|
|
"calibration/unique_confidence_per_question": 0.2109375,
|
|
"calibration/unique_confidences": 81.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00980902777777779,
|
|
"completions/max_length": 3568.8,
|
|
"completions/max_terminated_length": 3568.8,
|
|
"completions/mean_length": 694.918212890625,
|
|
"completions/mean_terminated_length": 701.8487915039062,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 187.8,
|
|
"epoch": 0.44399445006937416,
|
|
"grad_norm": 0.0004112598253414035,
|
|
"learning_rate": 6.927710843373495e-07,
|
|
"loss": -0.0072,
|
|
"num_tokens": 395793691.0,
|
|
"reward": 1.0613837242126465,
|
|
"reward_std": 0.12879109233617783,
|
|
"rewards/accuracy_reward": 0.692187488079071,
|
|
"rewards/brier_reward": 0.8143394708633422,
|
|
"rewards/confidence_uniqueness_reward": 0.9319449424743652,
|
|
"rewards/format_reward": 0.9901041746139526,
|
|
"rewards/frontier_aurc_reward": -0.0021198054775595663,
|
|
"rewards/frontier_coverage_0": 0.01427230816334486,
|
|
"rewards/frontier_coverage_1": 0.01427230816334486,
|
|
"rewards/frontier_coverage_10": 0.01427230816334486,
|
|
"rewards/frontier_coverage_15": 0.016747461259365083,
|
|
"rewards/frontier_coverage_20": 0.02341715954244137,
|
|
"rewards/frontier_coverage_25": 0.05537489578127861,
|
|
"rewards/frontier_coverage_5": 0.01427230816334486,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.003194707864895463,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16829426884651183,
|
|
"signal/accuracy_reward/group_std_mean": 0.2190181851387024,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.38333333730697633,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08414713442325591,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08414713442325591,
|
|
"signal/advantage_abs_mean": 0.09635478109121323,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09635478109121323,
|
|
"signal/advantage_pre_scale_std": 0.1639193296432495,
|
|
"signal/advantage_std": 0.1639193296432495,
|
|
"signal/brier_reward/centered_abs_mean": 0.12017861008644104,
|
|
"signal/brier_reward/group_std_mean": 0.15563611090183258,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01502232626080513,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01502232626080513,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.037514998018741606,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05578840374946594,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004689374752342701,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004689374752342701,
|
|
"signal/format_reward/centered_abs_mean": 0.016894531436264515,
|
|
"signal/format_reward/group_std_mean": 0.030554963275790215,
|
|
"signal/format_reward/group_zero_std_frac": 0.8777777791023255,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.008447265718132257,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.008447265718132257,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.002720799436792731,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.004833174217492342,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.2512491199886425e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.2512491199886425e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.130600044131279,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.18123140037059784,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.130600044131279,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.18123140037059784,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.130600044131279,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.18123140037059784,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.08408356457948685,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.11986269503831863,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001313805696554482,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001313805696554482,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.0467585064470768,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.06627060770988465,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.000730601663235575,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.000730601663235575,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.04904806688427925,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.06319007501006127,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0007663760450668633,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0007663760450668633,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.130600044131279,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.18123140037059784,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.002040625689551234,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.003226568968966603,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.004763441625982523,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0004033211211208254,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0004033211211208254,
|
|
"step": 185
|
|
},
|
|
{
|
|
"calibration/aurc": 0.16887821111438306,
|
|
"calibration/batch_distribution_entropy": 0.7166309619833362,
|
|
"calibration/batch_entropy_100bins": 0.793829835616982,
|
|
"calibration/batch_entropy_10bins": 0.7166309619833362,
|
|
"calibration/batch_entropy_50bins": 0.7992505076218754,
|
|
"calibration/batch_uniqueness": 0.9261153678295246,
|
|
"calibration/buffer_distribution_entropy": 0.800176021818779,
|
|
"calibration/buffer_entropy_100bins": 0.8647032984957514,
|
|
"calibration/buffer_entropy_10bins": 0.800176021818779,
|
|
"calibration/buffer_entropy_50bins": 0.8680643724938966,
|
|
"calibration/confidence_entropy": 0.4969127807971727,
|
|
"calibration/coverage@0%": 0.013662280701754386,
|
|
"calibration/coverage@1%": 0.013662280701754386,
|
|
"calibration/coverage@10%": 0.38303179824561406,
|
|
"calibration/coverage@15%": 0.5142105263157895,
|
|
"calibration/coverage@20%": 0.5489473684210526,
|
|
"calibration/coverage@25%": 0.938843201754386,
|
|
"calibration/coverage@30%": 0.9942105263157895,
|
|
"calibration/coverage@5%": 0.0999780701754386,
|
|
"calibration/distribution_entropy_10": 0.7166309619833362,
|
|
"calibration/distribution_entropy_100": 0.793829835616982,
|
|
"calibration/ece": 0.11243645289887176,
|
|
"calibration/mean_confidence": 0.6973970449081458,
|
|
"calibration/unique_confidence_per_question": 0.18958333333333335,
|
|
"calibration/unique_confidences": 72.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00512152777777779,
|
|
"completions/max_length": 3413.8,
|
|
"completions/max_terminated_length": 3413.8,
|
|
"completions/mean_length": 683.5370727539063,
|
|
"completions/mean_terminated_length": 687.0613037109375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 192.6,
|
|
"epoch": 0.45599430007124914,
|
|
"grad_norm": 0.0003837902913801372,
|
|
"learning_rate": 5.421686746987952e-07,
|
|
"loss": -0.0035,
|
|
"num_tokens": 406750982.0,
|
|
"reward": 1.0861162424087525,
|
|
"reward_std": 0.11900279521942139,
|
|
"rewards/accuracy_reward": 0.7318576335906982,
|
|
"rewards/brier_reward": 0.8348490834236145,
|
|
"rewards/confidence_uniqueness_reward": 0.92904052734375,
|
|
"rewards/format_reward": 0.9947916746139527,
|
|
"rewards/frontier_aurc_reward": -0.0019866148009896278,
|
|
"rewards/frontier_coverage_0": 0.013782516145147384,
|
|
"rewards/frontier_coverage_1": 0.013782516145147384,
|
|
"rewards/frontier_coverage_10": 0.014211940788663923,
|
|
"rewards/frontier_coverage_15": 0.018956656288355588,
|
|
"rewards/frontier_coverage_20": 0.02874315045773983,
|
|
"rewards/frontier_coverage_25": 0.07212998867034912,
|
|
"rewards/frontier_coverage_5": 0.013782516145147384,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.003232320211827755,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.16196288764476777,
|
|
"signal/accuracy_reward/group_std_mean": 0.21733182072639465,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.3694444477558136,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08098144382238388,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08098144382238388,
|
|
"signal/advantage_abs_mean": 0.08545112162828446,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08545112162828446,
|
|
"signal/advantage_pre_scale_std": 0.15151307582855225,
|
|
"signal/advantage_std": 0.15151307582855225,
|
|
"signal/brier_reward/centered_abs_mean": 0.10914837270975113,
|
|
"signal/brier_reward/group_std_mean": 0.14498610198497772,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01364354658871889,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01364354658871889,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03477521277964115,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05167415216565132,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004346901597455144,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004346901597455144,
|
|
"signal/format_reward/centered_abs_mean": 0.009429253498092293,
|
|
"signal/format_reward/group_std_mean": 0.021391174383461477,
|
|
"signal/format_reward/group_zero_std_frac": 0.8972222208976746,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.004714626749046147,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.004714626749046147,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0025213475339114664,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.004476012662053108,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.939605521736666e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.939605521736666e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.13185926526784897,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.18062789738178253,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.00206030101981014,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.00206030101981014,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.13185926526784897,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.18062789738178253,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.00206030101981014,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.00206030101981014,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12977752983570098,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1778223305940628,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.002027773903682828,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.002027773903682828,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.08260580152273178,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.11576226651668549,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001290715648792684,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001290715648792684,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04762231633067131,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.06589499711990357,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007440986926667392,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007440986926667392,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.05049858167767525,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.06524901390075684,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0007890403387136758,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0007890403387136758,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.13185926526784897,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.18062789738178253,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.00206030101981014,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.00206030101981014,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.003113184357061982,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.004526341799646616,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00038914804463274777,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00038914804463274777,
|
|
"step": 190
|
|
},
|
|
{
|
|
"calibration/aurc": 0.2062296307283377,
|
|
"calibration/batch_distribution_entropy": 0.8093306883536086,
|
|
"calibration/batch_entropy_100bins": 0.8432700346966481,
|
|
"calibration/batch_entropy_10bins": 0.8093306883536086,
|
|
"calibration/batch_entropy_50bins": 0.859267476518579,
|
|
"calibration/batch_uniqueness": 0.9426119233178885,
|
|
"calibration/buffer_distribution_entropy": 0.7945642543079567,
|
|
"calibration/buffer_entropy_100bins": 0.8605465543658124,
|
|
"calibration/buffer_entropy_10bins": 0.7945642543079567,
|
|
"calibration/buffer_entropy_50bins": 0.8645292173196768,
|
|
"calibration/confidence_entropy": 0.5176270953297042,
|
|
"calibration/coverage@0%": 0.004699738903394255,
|
|
"calibration/coverage@1%": 0.004699738903394255,
|
|
"calibration/coverage@10%": 0.28507615891758575,
|
|
"calibration/coverage@15%": 0.4576479260387029,
|
|
"calibration/coverage@20%": 0.5506327631461108,
|
|
"calibration/coverage@25%": 0.617482242623319,
|
|
"calibration/coverage@30%": 0.772239810615784,
|
|
"calibration/coverage@5%": 0.09712793733681462,
|
|
"calibration/distribution_entropy_10": 0.8093306883536086,
|
|
"calibration/distribution_entropy_100": 0.8432700346966481,
|
|
"calibration/ece": 0.13137105508504543,
|
|
"calibration/mean_confidence": 0.6472678487210273,
|
|
"calibration/unique_confidence_per_question": 0.21197916666666666,
|
|
"calibration/unique_confidences": 81.4,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.011545138888888884,
|
|
"completions/max_length": 3631.2,
|
|
"completions/max_terminated_length": 3631.2,
|
|
"completions/mean_length": 703.418408203125,
|
|
"completions/mean_terminated_length": 711.7278564453125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 189.0,
|
|
"epoch": 0.46799415007312406,
|
|
"grad_norm": 0.00042763527017086744,
|
|
"learning_rate": 3.91566265060241e-07,
|
|
"loss": -0.011,
|
|
"num_tokens": 417935226.0,
|
|
"reward": 1.0512622594833374,
|
|
"reward_std": 0.12501877546310425,
|
|
"rewards/accuracy_reward": 0.6756076455116272,
|
|
"rewards/brier_reward": 0.8063122510910035,
|
|
"rewards/confidence_uniqueness_reward": 0.9281278252601624,
|
|
"rewards/format_reward": 0.9883680462837219,
|
|
"rewards/frontier_aurc_reward": -0.0024975771084427834,
|
|
"rewards/frontier_coverage_0": 0.018676279671490194,
|
|
"rewards/frontier_coverage_1": 0.018676279671490194,
|
|
"rewards/frontier_coverage_10": 0.01876285169273615,
|
|
"rewards/frontier_coverage_15": 0.0209655387327075,
|
|
"rewards/frontier_coverage_20": 0.02632690779864788,
|
|
"rewards/frontier_coverage_25": 0.06297426149249077,
|
|
"rewards/frontier_coverage_5": 0.018676279671490194,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0030649449676275254,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.156504987180233,
|
|
"signal/accuracy_reward/group_std_mean": 0.20543068647384644,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4194444477558136,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0782524935901165,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0782524935901165,
|
|
"signal/advantage_abs_mean": 0.09245173931121826,
|
|
"signal/advantage_pre_scale_abs_mean": 0.09245173931121826,
|
|
"signal/advantage_pre_scale_std": 0.1626460701227188,
|
|
"signal/advantage_std": 0.1626460701227188,
|
|
"signal/brier_reward/centered_abs_mean": 0.1210327297449112,
|
|
"signal/brier_reward/group_std_mean": 0.15611167550086974,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0151290912181139,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.0151290912181139,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04194310083985329,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.06149864494800568,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005242887604981661,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005242887604981661,
|
|
"signal/format_reward/centered_abs_mean": 0.01963975690305233,
|
|
"signal/format_reward/group_std_mean": 0.03504836894571781,
|
|
"signal/format_reward/group_zero_std_frac": 0.8611111164093017,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.009819878451526164,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.009819878451526164,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.003186128893867135,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.005603937339037657,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.9783263966673986e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.9783263966673986e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.12809589505195618,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.17424156665802001,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0020014983601868153,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0020014983601868153,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.12809589505195618,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.17424156665802001,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0020014983601868153,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0020014983601868153,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12744116485118867,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.17343433499336242,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001991268200799823,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001991268200799823,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.07363787293434143,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.10297145694494247,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0011505917645990849,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0011505917645990849,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.044591452926397324,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.06145058870315552,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0006967414519749582,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0006967414519749582,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.052361331135034564,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.0672803521156311,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0008181457989849151,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0008181457989849151,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.12809589505195618,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.17424156665802001,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0020014983601868153,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0020014983601868153,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0031045635230839254,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.004452465567737818,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0003880704403854907,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0003880704403854907,
|
|
"step": 195
|
|
},
|
|
{
|
|
"calibration/aurc": 0.18052983140791248,
|
|
"calibration/batch_distribution_entropy": 0.7398158763396785,
|
|
"calibration/batch_entropy_100bins": 0.8093075678083164,
|
|
"calibration/batch_entropy_10bins": 0.7398158763396785,
|
|
"calibration/batch_entropy_50bins": 0.8162096039603671,
|
|
"calibration/batch_uniqueness": 0.9336981234577946,
|
|
"calibration/buffer_distribution_entropy": 0.7927739625462163,
|
|
"calibration/buffer_entropy_100bins": 0.8589941803683226,
|
|
"calibration/buffer_entropy_10bins": 0.7927739625462163,
|
|
"calibration/buffer_entropy_50bins": 0.8634857187639527,
|
|
"calibration/confidence_entropy": 0.5153811073665431,
|
|
"calibration/coverage@0%": 0.0062856144931519396,
|
|
"calibration/coverage@1%": 0.0062856144931519396,
|
|
"calibration/coverage@10%": 0.1866731368237827,
|
|
"calibration/coverage@15%": 0.38207938252943063,
|
|
"calibration/coverage@20%": 0.838360745614035,
|
|
"calibration/coverage@25%": 0.9291008771929825,
|
|
"calibration/coverage@30%": 0.9573848684210526,
|
|
"calibration/coverage@5%": 0.0062856144931519396,
|
|
"calibration/distribution_entropy_10": 0.7398158763396785,
|
|
"calibration/distribution_entropy_100": 0.8093075678083164,
|
|
"calibration/ece": 0.109616134077216,
|
|
"calibration/mean_confidence": 0.6974732587107646,
|
|
"calibration/unique_confidence_per_question": 0.19479166666666664,
|
|
"calibration/unique_confidences": 74.8,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.007899305555555559,
|
|
"completions/max_length": 3355.0,
|
|
"completions/max_terminated_length": 3355.0,
|
|
"completions/mean_length": 681.3518310546875,
|
|
"completions/mean_terminated_length": 686.8287475585937,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 195.0,
|
|
"epoch": 0.47999400007499904,
|
|
"grad_norm": 0.0004486938123591244,
|
|
"learning_rate": 2.409638554216868e-07,
|
|
"loss": -0.0059,
|
|
"num_tokens": 428852207.0,
|
|
"reward": 1.070695161819458,
|
|
"reward_std": 0.11748828440904617,
|
|
"rewards/accuracy_reward": 0.7063368082046508,
|
|
"rewards/brier_reward": 0.8224664568901062,
|
|
"rewards/confidence_uniqueness_reward": 0.9281678915023803,
|
|
"rewards/format_reward": 0.9921006917953491,
|
|
"rewards/frontier_aurc_reward": -0.002603556914255023,
|
|
"rewards/frontier_coverage_0": 0.01670327754691243,
|
|
"rewards/frontier_coverage_1": 0.01670327754691243,
|
|
"rewards/frontier_coverage_10": 0.016874231677502394,
|
|
"rewards/frontier_coverage_15": 0.021660929918289183,
|
|
"rewards/frontier_coverage_20": 0.030010566860437394,
|
|
"rewards/frontier_coverage_25": 0.07551120072603226,
|
|
"rewards/frontier_coverage_5": 0.01670327754691243,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0027688577305525542,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.14654405266046525,
|
|
"signal/accuracy_reward/group_std_mean": 0.19502569139003753,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4277777910232544,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07327202633023262,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07327202633023262,
|
|
"signal/advantage_abs_mean": 0.08604095876216888,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08604095876216888,
|
|
"signal/advantage_pre_scale_std": 0.15435749292373657,
|
|
"signal/advantage_std": 0.15435749292373657,
|
|
"signal/brier_reward/centered_abs_mean": 0.10942392647266388,
|
|
"signal/brier_reward/group_std_mean": 0.14457024335861207,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013677990809082986,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.013677990809082986,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03794047012925148,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.055311404168605804,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004742558766156435,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004742558766156435,
|
|
"signal/format_reward/centered_abs_mean": 0.01360134556889534,
|
|
"signal/format_reward/group_std_mean": 0.025955809652805327,
|
|
"signal/format_reward/group_zero_std_frac": 0.8916666746139527,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00680067278444767,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.00680067278444767,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.003289140481501818,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.005822925828397274,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 5.1392820023465904e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 5.1392820023465904e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.1157380223274231,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.16240898966789247,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.001808406598865986,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.001808406598865986,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.1157380223274231,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.16240898966789247,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.001808406598865986,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.001808406598865986,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.11215617209672928,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.15783893167972565,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.001752440189011395,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.001752440189011395,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.06608396619558335,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.09578151851892472,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0010325619718059898,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0010325619718059898,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04160864725708961,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.058051402866840365,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0006501351133920252,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0006501351133920252,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.05538794472813606,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.07133743911981583,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0008654366363771259,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0008654366363771259,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.1157380223274231,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.16240898966789247,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.001808406598865986,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.001808406598865986,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0029893687460571527,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.004549006605520845,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0003736710932571441,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0003736710932571441,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.47999400007499904,
|
|
"eval_completions/clipped_ratio": 0.008680555555555544,
|
|
"eval_completions/max_length": 2405.5,
|
|
"eval_completions/max_terminated_length": 2405.5,
|
|
"eval_completions/mean_length": 691.6689249674479,
|
|
"eval_completions/mean_terminated_length": 697.7305806477865,
|
|
"eval_completions/min_length": 56.833333333333336,
|
|
"eval_completions/min_terminated_length": 249.66666666666666,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 428852207.0,
|
|
"eval_reward": 1.0510073900222778,
|
|
"eval_reward_std": 0.2505972956617673,
|
|
"eval_rewards/accuracy_reward": 0.6796875,
|
|
"eval_rewards/brier_reward": 0.8202938636144003,
|
|
"eval_rewards/confidence_uniqueness_reward": 0.8755098978678385,
|
|
"eval_rewards/format_reward": 0.9913194378217062,
|
|
"eval_rewards/frontier_aurc_reward": -0.002243300104358544,
|
|
"eval_rewards/frontier_coverage_0": 0.029487861630817253,
|
|
"eval_rewards/frontier_coverage_1": 0.029487861630817253,
|
|
"eval_rewards/frontier_coverage_10": 0.02955207011351983,
|
|
"eval_rewards/frontier_coverage_15": 0.02661541321625312,
|
|
"eval_rewards/frontier_coverage_20": 0.03148760460317135,
|
|
"eval_rewards/frontier_coverage_25": 0.0767225877692302,
|
|
"eval_rewards/frontier_coverage_5": 0.029487861630817253,
|
|
"eval_rewards/true_frontier_ece_gap_only_reward": -0.0030973663087934256,
|
|
"eval_runtime": 184.977,
|
|
"eval_samples_per_second": 5.406,
|
|
"eval_signal/accuracy_reward/centered_abs_mean": 0.4178059895833333,
|
|
"eval_signal/accuracy_reward/group_std_mean": 0.4625024398167928,
|
|
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.20890299479166666,
|
|
"eval_signal/accuracy_reward/weight": 0.5,
|
|
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.20890299479166666,
|
|
"eval_signal/advantage_abs_mean": 0.21793826669454575,
|
|
"eval_signal/advantage_pre_scale_abs_mean": 0.21793826669454575,
|
|
"eval_signal/advantage_pre_scale_std": 0.24974885831276575,
|
|
"eval_signal/advantage_std": 0.24974885831276575,
|
|
"eval_signal/brier_reward/centered_abs_mean": 0.18156319856643677,
|
|
"eval_signal/brier_reward/group_std_mean": 0.2326709752281507,
|
|
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.022695399820804596,
|
|
"eval_signal/brier_reward/weight": 0.125,
|
|
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.022695399820804596,
|
|
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.0633502888182799,
|
|
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.08953885920345783,
|
|
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.007918786102284988,
|
|
"eval_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007918786102284988,
|
|
"eval_signal/format_reward/centered_abs_mean": 0.01671006918574373,
|
|
"eval_signal/format_reward/group_std_mean": 0.04611522859583298,
|
|
"eval_signal/format_reward/group_zero_std_frac": 0.750000019868215,
|
|
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.008355034592871865,
|
|
"eval_signal/format_reward/weight": 0.5,
|
|
"eval_signal/format_reward/weighted_centered_abs_mean": 0.008355034592871865,
|
|
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.0037845204351469874,
|
|
"eval_signal/frontier_aurc_reward/group_std_mean": 0.00772972172126174,
|
|
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 5.913313179917168e-05,
|
|
"eval_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 5.913313179917168e-05,
|
|
"eval_signal/frontier_coverage_0/centered_abs_mean": 0.1768440529704094,
|
|
"eval_signal/frontier_coverage_0/group_std_mean": 0.275893231232961,
|
|
"eval_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0027631883276626468,
|
|
"eval_signal/frontier_coverage_0/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0027631883276626468,
|
|
"eval_signal/frontier_coverage_1/centered_abs_mean": 0.1768440529704094,
|
|
"eval_signal/frontier_coverage_1/group_std_mean": 0.275893231232961,
|
|
"eval_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0027631883276626468,
|
|
"eval_signal/frontier_coverage_1/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0027631883276626468,
|
|
"eval_signal/frontier_coverage_10/centered_abs_mean": 0.17100265125433603,
|
|
"eval_signal/frontier_coverage_10/group_std_mean": 0.2681894302368164,
|
|
"eval_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0026719164258490005,
|
|
"eval_signal/frontier_coverage_10/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0026719164258490005,
|
|
"eval_signal/frontier_coverage_15/centered_abs_mean": 0.09313676009575526,
|
|
"eval_signal/frontier_coverage_15/group_std_mean": 0.16024632503588995,
|
|
"eval_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001455261876496176,
|
|
"eval_signal/frontier_coverage_15/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001455261876496176,
|
|
"eval_signal/frontier_coverage_20/centered_abs_mean": 0.05307722526292006,
|
|
"eval_signal/frontier_coverage_20/group_std_mean": 0.08430640151103337,
|
|
"eval_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0008293316447331259,
|
|
"eval_signal/frontier_coverage_20/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0008293316447331259,
|
|
"eval_signal/frontier_coverage_25/centered_abs_mean": 0.09919273108243942,
|
|
"eval_signal/frontier_coverage_25/group_std_mean": 0.11935225501656532,
|
|
"eval_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.001549886423163116,
|
|
"eval_signal/frontier_coverage_25/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.001549886423163116,
|
|
"eval_signal/frontier_coverage_5/centered_abs_mean": 0.1768440529704094,
|
|
"eval_signal/frontier_coverage_5/group_std_mean": 0.275893231232961,
|
|
"eval_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"eval_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0027631883276626468,
|
|
"eval_signal/frontier_coverage_5/weight": 0.015625,
|
|
"eval_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0027631883276626468,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.00442255346570164,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.0076944112467269106,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.000552819183212705,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"eval_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.000552819183212705,
|
|
"eval_steps_per_second": 0.032,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.47999400007499904,
|
|
"step": 200,
|
|
"train_probe_completions/clipped_ratio": 0.012847222222222232,
|
|
"train_probe_completions/max_length": 2355.3333333333335,
|
|
"train_probe_completions/max_terminated_length": 2355.3333333333335,
|
|
"train_probe_completions/mean_length": 680.9791870117188,
|
|
"train_probe_completions/mean_terminated_length": 689.9359436035156,
|
|
"train_probe_completions/min_length": 0.0,
|
|
"train_probe_completions/min_terminated_length": 217.16666666666666,
|
|
"train_probe_loss": 0.0,
|
|
"train_probe_num_tokens": 428852207.0,
|
|
"train_probe_reward": 1.067217191060384,
|
|
"train_probe_reward_std": 0.25106702248255414,
|
|
"train_probe_rewards/accuracy_reward": 0.7170139054457346,
|
|
"train_probe_rewards/brier_reward": 0.8235729734102885,
|
|
"train_probe_rewards/confidence_uniqueness_reward": 0.8701435724894205,
|
|
"train_probe_rewards/format_reward": 0.9895833333333334,
|
|
"train_probe_rewards/frontier_aurc_reward": -0.002555853434993575,
|
|
"train_probe_rewards/frontier_coverage_0": 0.00876838636274139,
|
|
"train_probe_rewards/frontier_coverage_1": 0.00876838636274139,
|
|
"train_probe_rewards/frontier_coverage_10": 0.009510708196709553,
|
|
"train_probe_rewards/frontier_coverage_15": 0.017479141689060878,
|
|
"train_probe_rewards/frontier_coverage_20": 0.02941159127900998,
|
|
"train_probe_rewards/frontier_coverage_25": 0.08159822722276051,
|
|
"train_probe_rewards/frontier_coverage_5": 0.00876838636274139,
|
|
"train_probe_rewards/true_frontier_ece_gap_only_reward": -0.002586768241599202,
|
|
"train_probe_runtime": 203.222,
|
|
"train_probe_samples_per_second": 4.921,
|
|
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.3974609375,
|
|
"train_probe_signal/accuracy_reward/group_std_mean": 0.4516189793745677,
|
|
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19873046875,
|
|
"train_probe_signal/accuracy_reward/weight": 0.5,
|
|
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.19873046875,
|
|
"train_probe_signal/advantage_abs_mean": 0.21062870572010675,
|
|
"train_probe_signal/advantage_pre_scale_abs_mean": 0.21062870572010675,
|
|
"train_probe_signal/advantage_pre_scale_std": 0.24998972316582999,
|
|
"train_probe_signal/advantage_std": 0.24998972316582999,
|
|
"train_probe_signal/brier_reward/centered_abs_mean": 0.1768066460887591,
|
|
"train_probe_signal/brier_reward/group_std_mean": 0.2324258784453074,
|
|
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.022100830761094887,
|
|
"train_probe_signal/brier_reward/weight": 0.125,
|
|
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.022100830761094887,
|
|
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.06669201205174129,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.1005245956281821,
|
|
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.00833650150646766,
|
|
"train_probe_signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.00833650150646766,
|
|
"train_probe_signal/format_reward/centered_abs_mean": 0.02007378451526165,
|
|
"train_probe_signal/format_reward/group_std_mean": 0.05593615584075451,
|
|
"train_probe_signal/format_reward/group_zero_std_frac": 0.6944444676240286,
|
|
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.010036892257630825,
|
|
"train_probe_signal/format_reward/weight": 0.5,
|
|
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.010036892257630825,
|
|
"train_probe_signal/frontier_aurc_reward/centered_abs_mean": 0.004501550691202283,
|
|
"train_probe_signal/frontier_aurc_reward/group_std_mean": 0.010145407247667512,
|
|
"train_probe_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 7.033672955003567e-05,
|
|
"train_probe_signal/frontier_aurc_reward/weight": 0.015625,
|
|
"train_probe_signal/frontier_aurc_reward/weighted_centered_abs_mean": 7.033672955003567e-05,
|
|
"train_probe_signal/frontier_coverage_0/centered_abs_mean": 0.1579719434181849,
|
|
"train_probe_signal/frontier_coverage_0/group_std_mean": 0.25707169622182846,
|
|
"train_probe_signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.002468311615909139,
|
|
"train_probe_signal/frontier_coverage_0/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_0/weighted_centered_abs_mean": 0.002468311615909139,
|
|
"train_probe_signal/frontier_coverage_1/centered_abs_mean": 0.1579719434181849,
|
|
"train_probe_signal/frontier_coverage_1/group_std_mean": 0.25707169622182846,
|
|
"train_probe_signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.002468311615909139,
|
|
"train_probe_signal/frontier_coverage_1/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_1/weighted_centered_abs_mean": 0.002468311615909139,
|
|
"train_probe_signal/frontier_coverage_10/centered_abs_mean": 0.15405935049057007,
|
|
"train_probe_signal/frontier_coverage_10/group_std_mean": 0.25156734387079877,
|
|
"train_probe_signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0024071773514151573,
|
|
"train_probe_signal/frontier_coverage_10/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0024071773514151573,
|
|
"train_probe_signal/frontier_coverage_15/centered_abs_mean": 0.08175658682982127,
|
|
"train_probe_signal/frontier_coverage_15/group_std_mean": 0.14768946915864944,
|
|
"train_probe_signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0012774466692159574,
|
|
"train_probe_signal/frontier_coverage_15/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0012774466692159574,
|
|
"train_probe_signal/frontier_coverage_20/centered_abs_mean": 0.04819720300535361,
|
|
"train_probe_signal/frontier_coverage_20/group_std_mean": 0.07652890309691429,
|
|
"train_probe_signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007530812969586501,
|
|
"train_probe_signal/frontier_coverage_20/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007530812969586501,
|
|
"train_probe_signal/frontier_coverage_25/centered_abs_mean": 0.09637204806009929,
|
|
"train_probe_signal/frontier_coverage_25/group_std_mean": 0.11713628967603047,
|
|
"train_probe_signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0015058132509390514,
|
|
"train_probe_signal/frontier_coverage_25/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0015058132509390514,
|
|
"train_probe_signal/frontier_coverage_5/centered_abs_mean": 0.1579719434181849,
|
|
"train_probe_signal/frontier_coverage_5/group_std_mean": 0.25707169622182846,
|
|
"train_probe_signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.002468311615909139,
|
|
"train_probe_signal/frontier_coverage_5/weight": 0.015625,
|
|
"train_probe_signal/frontier_coverage_5/weighted_centered_abs_mean": 0.002468311615909139,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0036215446889400482,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.006501481092224519,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00045269308611750603,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"train_probe_signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00045269308611750603,
|
|
"train_probe_steps_per_second": 0.03
|
|
},
|
|
{
|
|
"calibration/aurc": 0.16907717783248571,
|
|
"calibration/batch_distribution_entropy": 0.7094331746984237,
|
|
"calibration/batch_entropy_100bins": 0.7936049195126413,
|
|
"calibration/batch_entropy_10bins": 0.7094331746984237,
|
|
"calibration/batch_entropy_50bins": 0.7939230371346102,
|
|
"calibration/batch_uniqueness": 0.924174709187571,
|
|
"calibration/buffer_distribution_entropy": 0.7921673058072191,
|
|
"calibration/buffer_entropy_100bins": 0.8584895422787383,
|
|
"calibration/buffer_entropy_10bins": 0.7921673058072191,
|
|
"calibration/buffer_entropy_50bins": 0.8628322959175796,
|
|
"calibration/confidence_entropy": 0.49239157203877826,
|
|
"calibration/coverage@0%": 0.010430265448215839,
|
|
"calibration/coverage@1%": 0.010430265448215839,
|
|
"calibration/coverage@10%": 0.11064104656222802,
|
|
"calibration/coverage@15%": 0.5501699847693647,
|
|
"calibration/coverage@20%": 0.8597367275892079,
|
|
"calibration/coverage@25%": 0.8936221714534378,
|
|
"calibration/coverage@30%": 0.9400198542210617,
|
|
"calibration/coverage@5%": 0.010430265448215839,
|
|
"calibration/distribution_entropy_10": 0.7094331746984237,
|
|
"calibration/distribution_entropy_100": 0.7936049195126413,
|
|
"calibration/ece": 0.10543889063662855,
|
|
"calibration/mean_confidence": 0.6970287687410912,
|
|
"calibration/unique_confidence_per_question": 0.19322916666666667,
|
|
"calibration/unique_confidences": 74.2,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.008767361111111116,
|
|
"completions/max_length": 2892.6,
|
|
"completions/max_terminated_length": 2892.6,
|
|
"completions/mean_length": 690.477685546875,
|
|
"completions/mean_terminated_length": 696.5508056640625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 157.6,
|
|
"epoch": 0.491993850076874,
|
|
"grad_norm": 0.0003582312201615423,
|
|
"learning_rate": 9.036144578313253e-08,
|
|
"loss": -0.0072,
|
|
"num_tokens": 439872462.0,
|
|
"reward": 1.092501163482666,
|
|
"reward_std": 0.11384595781564713,
|
|
"rewards/accuracy_reward": 0.7523437380790711,
|
|
"rewards/brier_reward": 0.828589677810669,
|
|
"rewards/confidence_uniqueness_reward": 0.9259551286697387,
|
|
"rewards/format_reward": 0.9912326335906982,
|
|
"rewards/frontier_aurc_reward": -0.00198111105710268,
|
|
"rewards/frontier_coverage_0": -0.00452432045713067,
|
|
"rewards/frontier_coverage_1": -0.00452432045713067,
|
|
"rewards/frontier_coverage_10": -0.003602027613669634,
|
|
"rewards/frontier_coverage_15": 0.012093347311019898,
|
|
"rewards/frontier_coverage_20": 0.03151162005960941,
|
|
"rewards/frontier_coverage_25": 0.09083455055952072,
|
|
"rewards/frontier_coverage_5": -0.00452432045713067,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.0032516193110495805,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.14517686665058135,
|
|
"signal/accuracy_reward/group_std_mean": 0.19360876083374023,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.4444444477558136,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07258843332529068,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07258843332529068,
|
|
"signal/advantage_abs_mean": 0.0818178191781044,
|
|
"signal/advantage_pre_scale_abs_mean": 0.0818178191781044,
|
|
"signal/advantage_pre_scale_std": 0.1523301661014557,
|
|
"signal/advantage_std": 0.1523301661014557,
|
|
"signal/brier_reward/centered_abs_mean": 0.1120417907834053,
|
|
"signal/brier_reward/group_std_mean": 0.14704422652721405,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014005223847925663,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.014005223847925663,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03881465494632721,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.05692438259720802,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004851831868290901,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004851831868290901,
|
|
"signal/format_reward/centered_abs_mean": 0.014360894355922938,
|
|
"signal/format_reward/group_std_mean": 0.027210034802556037,
|
|
"signal/format_reward/group_zero_std_frac": 0.8888888835906983,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007180447177961469,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.007180447177961469,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0026562759652733804,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.005001515662297606,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.150431195739657e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.150431195739657e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.12333370298147202,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.16834968626499175,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.0019270891090855003,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.0019270891090855003,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.12333370298147202,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.16834968626499175,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.0019270891090855003,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.0019270891090855003,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.11967587620019912,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.1636903315782547,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0018699355656281113,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0018699355656281113,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.06968192905187606,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.09741113483905792,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.0010887801414355635,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.0010887801414355635,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04491528794169426,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.060000843554735186,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007018013740889729,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007018013740889729,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.057969672977924346,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.0744464322924614,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0009057761402800679,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0009057761402800679,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.12333370298147202,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.16834968626499175,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.0019270891090855003,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.0019270891090855003,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0032607629895210267,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.004654625337570906,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.00040759537369012834,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.00040759537369012834,
|
|
"step": 205
|
|
},
|
|
{
|
|
"calibration/aurc": 0.1370533125922073,
|
|
"calibration/batch_distribution_entropy": 0.7072075783416373,
|
|
"calibration/batch_entropy_100bins": 0.7965424913173278,
|
|
"calibration/batch_entropy_10bins": 0.7072075783416373,
|
|
"calibration/batch_entropy_50bins": 0.7991626918689265,
|
|
"calibration/batch_uniqueness": 0.9300988239372975,
|
|
"calibration/buffer_distribution_entropy": 0.7932410370024624,
|
|
"calibration/buffer_entropy_100bins": 0.8591231164142057,
|
|
"calibration/buffer_entropy_10bins": 0.7932410370024624,
|
|
"calibration/buffer_entropy_50bins": 0.8632449929488756,
|
|
"calibration/confidence_entropy": 0.5134386595908284,
|
|
"calibration/coverage@0%": 0.007853439020572171,
|
|
"calibration/coverage@1%": 0.007853439020572171,
|
|
"calibration/coverage@10%": 0.21872482437993335,
|
|
"calibration/coverage@15%": 0.7426824379239719,
|
|
"calibration/coverage@20%": 0.8840740183815656,
|
|
"calibration/coverage@25%": 0.9695113893711355,
|
|
"calibration/coverage@30%": 1.0,
|
|
"calibration/coverage@5%": 0.04005535372901429,
|
|
"calibration/distribution_entropy_10": 0.7072075783416373,
|
|
"calibration/distribution_entropy_100": 0.7965424913173278,
|
|
"calibration/ece": 0.07591815575628273,
|
|
"calibration/mean_confidence": 0.7097033007115973,
|
|
"calibration/unique_confidence_per_question": 0.18229166666666666,
|
|
"calibration/unique_confidences": 70.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.005353009259259227,
|
|
"completions/max_length": 3599.0,
|
|
"completions/max_terminated_length": 3599.0,
|
|
"completions/mean_length": 694.3878784179688,
|
|
"completions/mean_terminated_length": 698.0981852213541,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 190.66666666666666,
|
|
"epoch": 0.49919376007799904,
|
|
"num_tokens": 446538119.0,
|
|
"reward": 1.0708950360616047,
|
|
"reward_std": 0.11581993599732716,
|
|
"rewards/accuracy_reward": 0.7042824029922485,
|
|
"rewards/brier_reward": 0.8196952740351359,
|
|
"rewards/confidence_uniqueness_reward": 0.9326375126838684,
|
|
"rewards/format_reward": 0.9945023059844971,
|
|
"rewards/frontier_aurc_reward": -0.002047328627668321,
|
|
"rewards/frontier_coverage_0": 0.012841465882956982,
|
|
"rewards/frontier_coverage_1": 0.012841465882956982,
|
|
"rewards/frontier_coverage_10": 0.012699058279395103,
|
|
"rewards/frontier_coverage_15": 0.017329357874890167,
|
|
"rewards/frontier_coverage_20": 0.0331996213644743,
|
|
"rewards/frontier_coverage_25": 0.0871302808324496,
|
|
"rewards/frontier_coverage_5": 0.012841465882956982,
|
|
"rewards/true_frontier_ece_gap_only_reward": -0.00366590932632486,
|
|
"signal/accuracy_reward/centered_abs_mean": 0.15892650187015533,
|
|
"signal/accuracy_reward/group_std_mean": 0.20545404652754465,
|
|
"signal/accuracy_reward/group_zero_std_frac": 0.43981483578681946,
|
|
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07946325093507767,
|
|
"signal/accuracy_reward/weight": 0.5,
|
|
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07946325093507767,
|
|
"signal/advantage_abs_mean": 0.08434396361311276,
|
|
"signal/advantage_pre_scale_abs_mean": 0.08434396361311276,
|
|
"signal/advantage_pre_scale_std": 0.15035154422124228,
|
|
"signal/advantage_std": 0.15035154422124228,
|
|
"signal/brier_reward/centered_abs_mean": 0.11334347476561864,
|
|
"signal/brier_reward/group_std_mean": 0.14777959883213043,
|
|
"signal/brier_reward/group_zero_std_frac": 0.0,
|
|
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01416793434570233,
|
|
"signal/brier_reward/weight": 0.125,
|
|
"signal/brier_reward/weighted_centered_abs_mean": 0.01416793434570233,
|
|
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03333321896692117,
|
|
"signal/confidence_uniqueness_reward/group_std_mean": 0.051341903706391655,
|
|
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
|
|
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.004166652370865147,
|
|
"signal/confidence_uniqueness_reward/weight": 0.125,
|
|
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004166652370865147,
|
|
"signal/format_reward/centered_abs_mean": 0.010308159670482079,
|
|
"signal/format_reward/group_std_mean": 0.02368570367495219,
|
|
"signal/format_reward/group_zero_std_frac": 0.8888888955116272,
|
|
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.005154079835241039,
|
|
"signal/format_reward/weight": 0.5,
|
|
"signal/format_reward/weighted_centered_abs_mean": 0.005154079835241039,
|
|
"signal/frontier_aurc_reward/centered_abs_mean": 0.0027586812308679023,
|
|
"signal/frontier_aurc_reward/group_std_mean": 0.0049862076217929525,
|
|
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
|
|
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.310439423231097e-05,
|
|
"signal/frontier_aurc_reward/weight": 0.015625,
|
|
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.310439423231097e-05,
|
|
"signal/frontier_coverage_0/centered_abs_mean": 0.1345667690038681,
|
|
"signal/frontier_coverage_0/group_std_mean": 0.18024377524852753,
|
|
"signal/frontier_coverage_0/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_0/scaled_weighted_centered_abs_mean": 0.002102605765685439,
|
|
"signal/frontier_coverage_0/weight": 0.015625,
|
|
"signal/frontier_coverage_0/weighted_centered_abs_mean": 0.002102605765685439,
|
|
"signal/frontier_coverage_1/centered_abs_mean": 0.1345667690038681,
|
|
"signal/frontier_coverage_1/group_std_mean": 0.18024377524852753,
|
|
"signal/frontier_coverage_1/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_1/scaled_weighted_centered_abs_mean": 0.002102605765685439,
|
|
"signal/frontier_coverage_1/weight": 0.015625,
|
|
"signal/frontier_coverage_1/weighted_centered_abs_mean": 0.002102605765685439,
|
|
"signal/frontier_coverage_10/centered_abs_mean": 0.12825309236844382,
|
|
"signal/frontier_coverage_10/group_std_mean": 0.17192438741525015,
|
|
"signal/frontier_coverage_10/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_10/scaled_weighted_centered_abs_mean": 0.0020039545682569346,
|
|
"signal/frontier_coverage_10/weight": 0.015625,
|
|
"signal/frontier_coverage_10/weighted_centered_abs_mean": 0.0020039545682569346,
|
|
"signal/frontier_coverage_15/centered_abs_mean": 0.07049262523651123,
|
|
"signal/frontier_coverage_15/group_std_mean": 0.09754702945550282,
|
|
"signal/frontier_coverage_15/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_15/scaled_weighted_centered_abs_mean": 0.001101447269320488,
|
|
"signal/frontier_coverage_15/weight": 0.015625,
|
|
"signal/frontier_coverage_15/weighted_centered_abs_mean": 0.001101447269320488,
|
|
"signal/frontier_coverage_20/centered_abs_mean": 0.04566365604599317,
|
|
"signal/frontier_coverage_20/group_std_mean": 0.06064340099692345,
|
|
"signal/frontier_coverage_20/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_20/scaled_weighted_centered_abs_mean": 0.0007134946257186433,
|
|
"signal/frontier_coverage_20/weight": 0.015625,
|
|
"signal/frontier_coverage_20/weighted_centered_abs_mean": 0.0007134946257186433,
|
|
"signal/frontier_coverage_25/centered_abs_mean": 0.05877576395869255,
|
|
"signal/frontier_coverage_25/group_std_mean": 0.07547732442617416,
|
|
"signal/frontier_coverage_25/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_25/scaled_weighted_centered_abs_mean": 0.0009183713118545711,
|
|
"signal/frontier_coverage_25/weight": 0.015625,
|
|
"signal/frontier_coverage_25/weighted_centered_abs_mean": 0.0009183713118545711,
|
|
"signal/frontier_coverage_5/centered_abs_mean": 0.1345667690038681,
|
|
"signal/frontier_coverage_5/group_std_mean": 0.18024377524852753,
|
|
"signal/frontier_coverage_5/group_zero_std_frac": 0.0,
|
|
"signal/frontier_coverage_5/scaled_weighted_centered_abs_mean": 0.002102605765685439,
|
|
"signal/frontier_coverage_5/weight": 0.015625,
|
|
"signal/frontier_coverage_5/weighted_centered_abs_mean": 0.002102605765685439,
|
|
"signal/true_frontier_ece_gap_only_reward/centered_abs_mean": 0.0035362955338011184,
|
|
"signal/true_frontier_ece_gap_only_reward/group_std_mean": 0.0049490658566355705,
|
|
"signal/true_frontier_ece_gap_only_reward/group_zero_std_frac": 0.0,
|
|
"signal/true_frontier_ece_gap_only_reward/scaled_weighted_centered_abs_mean": 0.0004420369417251398,
|
|
"signal/true_frontier_ece_gap_only_reward/weight": 0.125,
|
|
"signal/true_frontier_ece_gap_only_reward/weighted_centered_abs_mean": 0.0004420369417251398,
|
|
"step": 208,
|
|
"total_flos": 0.0,
|
|
"train_loss": -0.00872318486038309,
|
|
"train_runtime": 40755.4175,
|
|
"train_samples_per_second": 0.368,
|
|
"train_steps_per_second": 0.005
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 208,
|
|
"num_input_tokens_seen": 446538119,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 60,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 6,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|