Files
RLCR-v4-ks-uniqueness-cov0-…/trainer_state.json
ModelHub XC 7f9173d4e8 初始化项目,由ModelHub XC社区提供模型
Model: hector-gr/RLCR-v4-ks-uniqueness-cov0-entropy100-noece-noaurc-scaletrue-batchcov0only-cold-math
Source: Original Platform
2026-05-09 19:39:15 +08:00

4442 lines
260 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.49919376007799904,
"eval_steps": 50,
"global_step": 208,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calibration/aurc": 0.5174302011976861,
"calibration/batch_distribution_entropy": 0.27968643045408775,
"calibration/batch_entropy_100bins": 0.3491582241290684,
"calibration/batch_entropy_10bins": 0.27968643045408775,
"calibration/batch_entropy_50bins": 0.408646715708448,
"calibration/batch_uniqueness": 0.5034224970546688,
"calibration/confidence_entropy": 0.222429721256218,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/distribution_entropy_10": 0.27968643045408775,
"calibration/distribution_entropy_100": 0.3491582241290684,
"calibration/ece": 0.47095527672812854,
"calibration/mean_confidence": 0.9153333731605595,
"calibration/unique_confidence_per_question": 0.03072916666666667,
"calibration/unique_confidences": 11.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.019704861111111138,
"completions/max_length": 4029.6,
"completions/max_terminated_length": 4029.6,
"completions/mean_length": 513.3373291015625,
"completions/mean_terminated_length": 523.6432006835937,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011999850001874977,
"grad_norm": 0.009435656480491161,
"learning_rate": 5.952380952380953e-07,
"loss": 0.0081,
"num_tokens": 9027854.0,
"reward": 0.434605211019516,
"reward_std": 0.3864780366420746,
"rewards/accuracy_reward": 0.26076388657093047,
"rewards/batch_coverage_0": 0.0075095250271260735,
"rewards/brier_reward": 0.3133890748023987,
"rewards/confidence_uniqueness_reward": 0.2916687369346619,
"rewards/format_reward": 0.6003472089767456,
"rewards/frontier_entropy_batch_reward": -0.5720709562301636,
"signal/accuracy_reward/centered_abs_mean": 0.3107530415058136,
"signal/accuracy_reward/group_std_mean": 0.36877918243408203,
"signal/accuracy_reward/group_zero_std_frac": 0.09722222313284874,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.4251779973506927,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.1553765207529068,
"signal/advantage_abs_mean": 0.8601556777954101,
"signal/advantage_pre_scale_abs_mean": 0.334391725063324,
"signal/advantage_pre_scale_std": 0.3892968833446503,
"signal/advantage_std": 0.9841951966285706,
"signal/batch_coverage_0/centered_abs_mean": 0.01619186196476221,
"signal/batch_coverage_0/group_std_mean": 0.03339236527681351,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.004439523350447417,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.001619186159223318,
"signal/brier_reward/centered_abs_mean": 0.32154104113578796,
"signal/brier_reward/group_std_mean": 0.37417513132095337,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08799342513084411,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.032154104113578795,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.2379334509372711,
"signal/confidence_uniqueness_reward/group_std_mean": 0.28854405879974365,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06512594521045685,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.023793345689773558,
"signal/format_reward/centered_abs_mean": 0.4406900942325592,
"signal/format_reward/group_std_mean": 0.47528126239776614,
"signal/format_reward/group_zero_std_frac": 0.0,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.603149700164795,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.2203450471162796,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.45061137676239016,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4829135715961456,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.123350390791893,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.04506114050745964,
"step": 5
},
{
"calibration/aurc": 0.538458366076803,
"calibration/batch_distribution_entropy": 0.27025803806008913,
"calibration/batch_entropy_100bins": 0.35320084312111905,
"calibration/batch_entropy_10bins": 0.27025803806008913,
"calibration/batch_entropy_50bins": 0.40764678867964327,
"calibration/batch_uniqueness": 0.5098736001992761,
"calibration/confidence_entropy": 0.2274931285048082,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/distribution_entropy_10": 0.27025803806008913,
"calibration/distribution_entropy_100": 0.35320084312111905,
"calibration/ece": 0.469860126629539,
"calibration/mean_confidence": 0.9163531404806801,
"calibration/unique_confidence_per_question": 0.034895833333333334,
"calibration/unique_confidences": 13.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017013888888888884,
"completions/max_length": 4031.8,
"completions/max_terminated_length": 4031.8,
"completions/mean_length": 475.27578125,
"completions/mean_terminated_length": 483.72108154296876,
"completions/min_length": 0.0,
"completions/min_terminated_length": 23.4,
"epoch": 0.023999700003749954,
"grad_norm": 0.007094210479408503,
"learning_rate": 1.1904761904761906e-06,
"loss": -0.0012,
"num_tokens": 17585751.0,
"reward": 0.5200118303298951,
"reward_std": 0.35560383200645446,
"rewards/accuracy_reward": 0.29826388955116273,
"rewards/batch_coverage_0": 0.008394665271043777,
"rewards/brier_reward": 0.36398414969444276,
"rewards/confidence_uniqueness_reward": 0.37082897424697875,
"rewards/format_reward": 0.7326388716697693,
"rewards/frontier_entropy_batch_reward": -0.6976032853126526,
"signal/accuracy_reward/centered_abs_mean": 0.32623698115348815,
"signal/accuracy_reward/group_std_mean": 0.38321188688278196,
"signal/accuracy_reward/group_zero_std_frac": 0.0777777798473835,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.5054798066616059,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.16311849057674407,
"signal/advantage_abs_mean": 0.8196688890457153,
"signal/advantage_pre_scale_abs_mean": 0.2959678113460541,
"signal/advantage_pre_scale_std": 0.35853559374809263,
"signal/advantage_std": 0.9841603994369507,
"signal/batch_coverage_0/centered_abs_mean": 0.016383717581629754,
"signal/batch_coverage_0/group_std_mean": 0.03455512970685959,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.005117970705032349,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0016383717767894268,
"signal/brier_reward/centered_abs_mean": 0.3206613898277283,
"signal/brier_reward/group_std_mean": 0.3725072264671326,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0991722971200943,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.032066140323877335,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.21787003576755523,
"signal/confidence_uniqueness_reward/group_std_mean": 0.27440804839134214,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06710911989212036,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.021787003427743912,
"signal/format_reward/centered_abs_mean": 0.33837890625,
"signal/format_reward/group_std_mean": 0.4072873592376709,
"signal/format_reward/group_zero_std_frac": 0.008333333395421505,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.5145779550075531,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.169189453125,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3695851922035217,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.43336753249168397,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.00555555559694767,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.11289727687835693,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.036958518996834756,
"step": 10
},
{
"calibration/aurc": 0.5306261075067018,
"calibration/batch_distribution_entropy": 0.32752820170291713,
"calibration/batch_entropy_100bins": 0.37619434130013085,
"calibration/batch_entropy_10bins": 0.32752820170291713,
"calibration/batch_entropy_50bins": 0.43504399957185147,
"calibration/batch_uniqueness": 0.560448811522282,
"calibration/confidence_entropy": 0.26276109891710886,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/distribution_entropy_10": 0.32752820170291713,
"calibration/distribution_entropy_100": 0.37619434130013085,
"calibration/ece": 0.481128862912443,
"calibration/mean_confidence": 0.902030702661782,
"calibration/unique_confidence_per_question": 0.0375,
"calibration/unique_confidences": 14.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009114583333333325,
"completions/max_length": 3973.0,
"completions/max_terminated_length": 3973.0,
"completions/mean_length": 421.3002563476563,
"completions/mean_terminated_length": 425.16774291992186,
"completions/min_length": 0.0,
"completions/min_terminated_length": 50.6,
"epoch": 0.03599955000562493,
"grad_norm": 0.00401267409324646,
"learning_rate": 1.7857142857142859e-06,
"loss": -0.0301,
"num_tokens": 25541114.0,
"reward": 0.6512134194374084,
"reward_std": 0.2598669737577438,
"rewards/accuracy_reward": 0.3289930522441864,
"rewards/batch_coverage_0": 0.010935892909765243,
"rewards/brier_reward": 0.4457497835159302,
"rewards/confidence_uniqueness_reward": 0.5453853368759155,
"rewards/format_reward": 0.9550347328186035,
"rewards/frontier_entropy_batch_reward": -0.9100759267807007,
"signal/accuracy_reward/centered_abs_mean": 0.32265625,
"signal/accuracy_reward/group_std_mean": 0.38412655591964723,
"signal/accuracy_reward/group_zero_std_frac": 0.06944444701075554,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.7317090034484863,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.161328125,
"signal/advantage_abs_mean": 0.7629533171653747,
"signal/advantage_pre_scale_abs_mean": 0.20725361704826356,
"signal/advantage_pre_scale_std": 0.2652052193880081,
"signal/advantage_std": 0.9840136289596557,
"signal/batch_coverage_0/centered_abs_mean": 0.02262604646384716,
"signal/batch_coverage_0/group_std_mean": 0.042140249907970426,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.010491053014993668,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022626047022640703,
"signal/brier_reward/centered_abs_mean": 0.2939529836177826,
"signal/brier_reward/group_std_mean": 0.3465470314025879,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1333274409174919,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.029395297914743424,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1786189019680023,
"signal/confidence_uniqueness_reward/group_std_mean": 0.2248939424753189,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.08192214220762253,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.01786189079284668,
"signal/format_reward/centered_abs_mean": 0.08039279617369174,
"signal/format_reward/group_std_mean": 0.1518175147473812,
"signal/format_reward/group_zero_std_frac": 0.3999999947845936,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.1705910697579384,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.04019639808684587,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.1546709954738617,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.26556792855262756,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.12500000409781933,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.06849853545427323,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.015467099472880363,
"step": 15
},
{
"calibration/aurc": 0.431346821288402,
"calibration/batch_distribution_entropy": 0.5426059445625475,
"calibration/batch_entropy_100bins": 0.45494946806997305,
"calibration/batch_entropy_10bins": 0.5426059445625475,
"calibration/batch_entropy_50bins": 0.5300555115048834,
"calibration/batch_uniqueness": 0.6894089034323752,
"calibration/buffer_distribution_entropy": 0.3539083886858205,
"calibration/buffer_entropy_100bins": 0.39710971496026076,
"calibration/buffer_entropy_10bins": 0.3539083886858205,
"calibration/buffer_entropy_50bins": 0.4590901769775059,
"calibration/confidence_entropy": 0.37666589181304533,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.013648293963254593,
"calibration/coverage@25%": 0.09658792650918635,
"calibration/coverage@30%": 0.14482142611754684,
"calibration/coverage@5%": 0.0,
"calibration/distribution_entropy_10": 0.5426059445625475,
"calibration/distribution_entropy_100": 0.45494946806997305,
"calibration/ece": 0.3324109675010253,
"calibration/mean_confidence": 0.842016994994642,
"calibration/unique_confidence_per_question": 0.04895833333333333,
"calibration/unique_confidences": 18.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010069444444444442,
"completions/max_length": 3539.4,
"completions/max_terminated_length": 3539.4,
"completions/mean_length": 462.4025146484375,
"completions/mean_terminated_length": 467.1513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.8,
"epoch": 0.04799940000749991,
"grad_norm": 0.003982796333730221,
"learning_rate": 2.380952380952381e-06,
"loss": -0.0258,
"num_tokens": 33981687.0,
"reward": 0.7565704345703125,
"reward_std": 0.21907421946525574,
"rewards/accuracy_reward": 0.45008680820465086,
"rewards/batch_coverage_0": 0.04114052355289459,
"rewards/brier_reward": 0.5963123440742493,
"rewards/confidence_uniqueness_reward": 0.6829663753509522,
"rewards/format_reward": 0.9855902791023254,
"rewards/frontier_entropy_batch_reward": -0.9331006526947021,
"signal/accuracy_reward/centered_abs_mean": 0.290185546875,
"signal/accuracy_reward/group_std_mean": 0.3576755583286285,
"signal/accuracy_reward/group_zero_std_frac": 0.08055555745959282,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9244364380836487,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.1450927734375,
"signal/advantage_abs_mean": 0.7477807521820068,
"signal/advantage_pre_scale_abs_mean": 0.17386467158794403,
"signal/advantage_pre_scale_std": 0.22889118492603303,
"signal/advantage_std": 0.9838362455368042,
"signal/batch_coverage_0/centered_abs_mean": 0.04352298155426979,
"signal/batch_coverage_0/group_std_mean": 0.06958689764142037,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.028244782984256745,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.004352298146113753,
"signal/brier_reward/centered_abs_mean": 0.23803514540195464,
"signal/brier_reward/group_std_mean": 0.2906170547008514,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15154335796833038,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.02380351461470127,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.13516120612621307,
"signal/confidence_uniqueness_reward/group_std_mean": 0.16643594652414323,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.08516475707292556,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.013516120798885822,
"signal/format_reward/centered_abs_mean": 0.02657335102558136,
"signal/format_reward/group_std_mean": 0.05870797857642174,
"signal/format_reward/group_zero_std_frac": 0.7305555701255798,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.08531446680426598,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01328667551279068,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.11649189740419388,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.2214953511953354,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.24166666865348815,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.07496421933174133,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.011649190448224545,
"step": 20
},
{
"calibration/aurc": 0.35370092548420196,
"calibration/batch_distribution_entropy": 0.7403816854106197,
"calibration/batch_entropy_100bins": 0.6544776213166862,
"calibration/batch_entropy_10bins": 0.7403816854106197,
"calibration/batch_entropy_50bins": 0.7077410795972892,
"calibration/batch_uniqueness": 0.8446109592830314,
"calibration/buffer_distribution_entropy": 0.4704443038922532,
"calibration/buffer_entropy_100bins": 0.45799401070372914,
"calibration/buffer_entropy_10bins": 0.4704443038922532,
"calibration/buffer_entropy_50bins": 0.5275006311693058,
"calibration/confidence_entropy": 0.5280357127233377,
"calibration/coverage@0%": 0.003674540682414698,
"calibration/coverage@1%": 0.003674540682414698,
"calibration/coverage@10%": 0.013746021109063495,
"calibration/coverage@15%": 0.05981931953838286,
"calibration/coverage@20%": 0.08077959217446043,
"calibration/coverage@25%": 0.21680172311625281,
"calibration/coverage@30%": 0.4438294080768707,
"calibration/coverage@5%": 0.003674540682414698,
"calibration/distribution_entropy_10": 0.7403816854106197,
"calibration/distribution_entropy_100": 0.6544776213166862,
"calibration/ece": 0.16746795065959824,
"calibration/mean_confidence": 0.7183618971817334,
"calibration/unique_confidence_per_question": 0.1578125,
"calibration/unique_confidences": 60.6,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01015625,
"completions/max_length": 3957.0,
"completions/max_terminated_length": 3957.0,
"completions/mean_length": 515.8568664550781,
"completions/mean_terminated_length": 521.1552551269531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.6,
"epoch": 0.05999925000937488,
"grad_norm": 0.0032299933955073357,
"learning_rate": 2.9761904761904763e-06,
"loss": -0.021,
"num_tokens": 43048806.0,
"reward": 0.8601099848747253,
"reward_std": 0.1965584486722946,
"rewards/accuracy_reward": 0.5633680582046509,
"rewards/batch_coverage_0": 0.07260396629571915,
"rewards/brier_reward": 0.7115841269493103,
"rewards/confidence_uniqueness_reward": 0.8290335893630981,
"rewards/format_reward": 0.9865451455116272,
"rewards/frontier_entropy_batch_reward": -0.7616880893707275,
"signal/accuracy_reward/centered_abs_mean": 0.25604383945465087,
"signal/accuracy_reward/group_std_mean": 0.3211396872997284,
"signal/accuracy_reward/group_zero_std_frac": 0.1555555582046509,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8837929010391236,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.12802191972732543,
"signal/advantage_abs_mean": 0.7358854413032532,
"signal/advantage_pre_scale_abs_mean": 0.1512439429759979,
"signal/advantage_pre_scale_std": 0.20898248255252838,
"signal/advantage_std": 0.983783233165741,
"signal/batch_coverage_0/centered_abs_mean": 0.0846152812242508,
"signal/batch_coverage_0/group_std_mean": 0.1144055426120758,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.058388042449951175,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00846152864396572,
"signal/brier_reward/centered_abs_mean": 0.16035446226596833,
"signal/brier_reward/group_std_mean": 0.20569514036178588,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.11123347133398057,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016035445593297483,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.09056732803583145,
"signal/confidence_uniqueness_reward/group_std_mean": 0.12100362330675125,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.062135671824216844,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.009056732803583146,
"signal/format_reward/centered_abs_mean": 0.02431098110973835,
"signal/format_reward/group_std_mean": 0.049627327173948285,
"signal/format_reward/group_zero_std_frac": 0.7861111164093018,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.08382118195295334,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.012155490554869175,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3184548020362854,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4208760499954224,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.033333333767950536,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.21904338300228118,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03184547908604145,
"step": 25
},
{
"calibration/aurc": 0.2839966804132776,
"calibration/batch_distribution_entropy": 0.9605051655041095,
"calibration/batch_entropy_100bins": 0.9429200926612158,
"calibration/batch_entropy_10bins": 0.9605051655041095,
"calibration/batch_entropy_50bins": 0.9610896445243384,
"calibration/batch_uniqueness": 0.9511843537092707,
"calibration/buffer_distribution_entropy": 0.6156539225956406,
"calibration/buffer_entropy_100bins": 0.601263494323698,
"calibration/buffer_entropy_10bins": 0.6156539225956406,
"calibration/buffer_entropy_50bins": 0.6609181402206332,
"calibration/confidence_entropy": 0.5267294948163934,
"calibration/coverage@0%": 0.006397462349291899,
"calibration/coverage@1%": 0.006397462349291899,
"calibration/coverage@10%": 0.006397462349291899,
"calibration/coverage@15%": 0.023509761814532543,
"calibration/coverage@20%": 0.10798261586113189,
"calibration/coverage@25%": 0.34141912353301845,
"calibration/coverage@30%": 0.822169028871391,
"calibration/coverage@5%": 0.006397462349291899,
"calibration/distribution_entropy_10": 0.9605051655041095,
"calibration/distribution_entropy_100": 0.9429200926612158,
"calibration/ece": 0.22083162645645044,
"calibration/mean_confidence": 0.5505724597008637,
"calibration/unique_confidence_per_question": 0.8807291666666666,
"calibration/unique_confidences": 338.2,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01605902777777777,
"completions/max_length": 3942.2,
"completions/max_terminated_length": 3942.2,
"completions/mean_length": 583.0853393554687,
"completions/mean_terminated_length": 592.7757568359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.6,
"epoch": 0.07199910001124986,
"grad_norm": 0.002717937109991908,
"learning_rate": 3.5714285714285718e-06,
"loss": -0.0357,
"num_tokens": 52875869.0,
"reward": 0.9348342657089234,
"reward_std": 0.18317421972751619,
"rewards/accuracy_reward": 0.5981770753860474,
"rewards/batch_coverage_0": 0.10351773351430893,
"rewards/brier_reward": 0.7025912404060364,
"rewards/confidence_uniqueness_reward": 0.9337292194366456,
"rewards/format_reward": 0.9822048664093017,
"rewards/frontier_entropy_batch_reward": -0.293405282497406,
"signal/accuracy_reward/centered_abs_mean": 0.23839518129825593,
"signal/accuracy_reward/group_std_mean": 0.2983840018510818,
"signal/accuracy_reward/group_zero_std_frac": 0.21111111491918563,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.869914174079895,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.11919759064912797,
"signal/advantage_abs_mean": 0.7542917847633361,
"signal/advantage_pre_scale_abs_mean": 0.14086539298295975,
"signal/advantage_pre_scale_std": 0.19977498352527617,
"signal/advantage_std": 0.983738362789154,
"signal/batch_coverage_0/centered_abs_mean": 0.19907444715499878,
"signal/batch_coverage_0/group_std_mean": 0.26103257536888125,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.14895964413881302,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.019907445646822453,
"signal/brier_reward/centered_abs_mean": 0.21252276003360748,
"signal/brier_reward/group_std_mean": 0.2615444421768188,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15797891467809677,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.02125227525830269,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04080349802970886,
"signal/confidence_uniqueness_reward/group_std_mean": 0.06831279695034027,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.030214974656701088,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004080350138247013,
"signal/format_reward/centered_abs_mean": 0.02994249165058136,
"signal/format_reward/group_std_mean": 0.055678685754537584,
"signal/format_reward/group_zero_std_frac": 0.7722222208976746,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.11192618533968926,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01497124582529068,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3531898558139801,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4236706256866455,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.2580131709575653,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03531898595392704,
"step": 30
},
{
"calibration/aurc": 0.22170318832234343,
"calibration/batch_distribution_entropy": 0.9421379128373853,
"calibration/batch_entropy_100bins": 0.939229582486355,
"calibration/batch_entropy_10bins": 0.9421379128373853,
"calibration/batch_entropy_50bins": 0.9489073127515452,
"calibration/batch_uniqueness": 0.9435042531493842,
"calibration/buffer_distribution_entropy": 0.7322581373218676,
"calibration/buffer_entropy_100bins": 0.7159217758015332,
"calibration/buffer_entropy_10bins": 0.7322581373218676,
"calibration/buffer_entropy_50bins": 0.7623503142731999,
"calibration/confidence_entropy": 0.45037526062650085,
"calibration/coverage@0%": 0.004776460128240555,
"calibration/coverage@1%": 0.004776460128240555,
"calibration/coverage@10%": 0.019253671924487202,
"calibration/coverage@15%": 0.1755596710110715,
"calibration/coverage@20%": 0.5139170791128755,
"calibration/coverage@25%": 0.7677293737551006,
"calibration/coverage@30%": 0.908045643554782,
"calibration/coverage@5%": 0.004776460128240555,
"calibration/distribution_entropy_10": 0.9421379128373853,
"calibration/distribution_entropy_100": 0.939229582486355,
"calibration/ece": 0.1732557673693993,
"calibration/mean_confidence": 0.6251879703657945,
"calibration/unique_confidence_per_question": 0.9864583333333334,
"calibration/unique_confidences": 378.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021788194444444464,
"completions/max_length": 3869.6,
"completions/max_terminated_length": 3869.6,
"completions/mean_length": 676.70703125,
"completions/mean_terminated_length": 691.8087524414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.8,
"epoch": 0.08399895001312484,
"grad_norm": 0.00269313994795084,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0507,
"num_tokens": 63748974.0,
"reward": 0.9618130803108216,
"reward_std": 0.1690991222858429,
"rewards/accuracy_reward": 0.6490451335906983,
"rewards/batch_coverage_0": 0.1625123217701912,
"rewards/brier_reward": 0.7147271633148193,
"rewards/confidence_uniqueness_reward": 0.9211899161338806,
"rewards/format_reward": 0.9766493082046509,
"rewards/frontier_entropy_batch_reward": -0.30877079665660856,
"signal/accuracy_reward/centered_abs_mean": 0.19120551347732545,
"signal/accuracy_reward/group_std_mean": 0.2488324373960495,
"signal/accuracy_reward/group_zero_std_frac": 0.3027777761220932,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9168439865112304,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09560275673866273,
"signal/advantage_abs_mean": 0.736086618900299,
"signal/advantage_pre_scale_abs_mean": 0.12685162276029588,
"signal/advantage_pre_scale_std": 0.1916327953338623,
"signal/advantage_std": 0.9835097432136536,
"signal/batch_coverage_0/centered_abs_mean": 0.22149142622947693,
"signal/batch_coverage_0/group_std_mean": 0.2891678690910339,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21263793408870696,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.02214914225041866,
"signal/brier_reward/centered_abs_mean": 0.23408417999744416,
"signal/brier_reward/group_std_mean": 0.28517125248909,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.22420990467071533,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.023408417776226997,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.053552377969026566,
"signal/confidence_uniqueness_reward/group_std_mean": 0.08374377638101578,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05234308801591396,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0053552379831671715,
"signal/format_reward/centered_abs_mean": 0.03601888045668602,
"signal/format_reward/group_std_mean": 0.06319156214594841,
"signal/format_reward/group_zero_std_frac": 0.7527777791023255,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.17084376215934755,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01800944022834301,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.381756055355072,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.44897361397743224,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.37677479684352877,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0381756067276001,
"step": 35
},
{
"calibration/aurc": 0.24541412524387626,
"calibration/batch_distribution_entropy": 0.9383568457569865,
"calibration/batch_entropy_100bins": 0.9411353191318031,
"calibration/batch_entropy_10bins": 0.9383568457569865,
"calibration/batch_entropy_50bins": 0.9473751559506512,
"calibration/batch_uniqueness": 0.9451766886989332,
"calibration/buffer_distribution_entropy": 0.7766920853518905,
"calibration/buffer_entropy_100bins": 0.7748170257856153,
"calibration/buffer_entropy_10bins": 0.7766920853518905,
"calibration/buffer_entropy_50bins": 0.8095014447600581,
"calibration/confidence_entropy": 0.5105895803297336,
"calibration/coverage@0%": 0.011043239043669717,
"calibration/coverage@1%": 0.011043239043669717,
"calibration/coverage@10%": 0.03677228627460259,
"calibration/coverage@15%": 0.08397971397598639,
"calibration/coverage@20%": 0.33836452037838355,
"calibration/coverage@25%": 0.6420780927200319,
"calibration/coverage@30%": 0.8848675040846755,
"calibration/coverage@5%": 0.011043239043669717,
"calibration/distribution_entropy_10": 0.9383568457569865,
"calibration/distribution_entropy_100": 0.9411353191318031,
"calibration/ece": 0.15390421541967228,
"calibration/mean_confidence": 0.6241893393048994,
"calibration/unique_confidence_per_question": 0.9890625,
"calibration/unique_confidences": 379.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01883680555555558,
"completions/max_length": 3759.2,
"completions/max_terminated_length": 3759.2,
"completions/mean_length": 753.6845581054688,
"completions/mean_terminated_length": 768.2565063476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 226.2,
"epoch": 0.09599880001499982,
"grad_norm": 0.013629751279950142,
"learning_rate": 4.761904761904762e-06,
"loss": -0.0464,
"num_tokens": 75550940.0,
"reward": 0.97404465675354,
"reward_std": 0.15893589854240417,
"rewards/accuracy_reward": 0.6609375,
"rewards/batch_coverage_0": 0.19304660856723785,
"rewards/brier_reward": 0.7446058511734008,
"rewards/confidence_uniqueness_reward": 0.9269584774971008,
"rewards/format_reward": 0.9806423664093018,
"rewards/frontier_entropy_batch_reward": -0.3320638656616211,
"signal/accuracy_reward/centered_abs_mean": 0.17389322817325592,
"signal/accuracy_reward/group_std_mean": 0.23077381253242493,
"signal/accuracy_reward/group_zero_std_frac": 0.347222226858139,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9138347387313843,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08694661408662796,
"signal/advantage_abs_mean": 0.7391909837722779,
"signal/advantage_pre_scale_abs_mean": 0.11861821860074998,
"signal/advantage_pre_scale_std": 0.18562033176422119,
"signal/advantage_std": 0.9834268927574158,
"signal/batch_coverage_0/centered_abs_mean": 0.1839183449745178,
"signal/batch_coverage_0/group_std_mean": 0.24077706336975097,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.1939650923013687,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.018391834944486617,
"signal/brier_reward/centered_abs_mean": 0.19649418592453002,
"signal/brier_reward/group_std_mean": 0.24308202862739564,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20717605352401733,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.019649419561028482,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.045768487453460696,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07060145288705826,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04819247126579285,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004576848912984132,
"signal/format_reward/centered_abs_mean": 0.03188476637005806,
"signal/format_reward/group_std_mean": 0.05429861918091774,
"signal/format_reward/group_zero_std_frac": 0.7972222328186035,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.16629874408245088,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01594238318502903,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.38026103377342224,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.44629310369491576,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4036199152469635,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.038026104867458346,
"step": 40
},
{
"calibration/aurc": 0.17401281085281242,
"calibration/batch_distribution_entropy": 0.9257879515650167,
"calibration/batch_entropy_100bins": 0.9322180704729982,
"calibration/batch_entropy_10bins": 0.9257879515650167,
"calibration/batch_entropy_50bins": 0.9403918369687571,
"calibration/batch_uniqueness": 0.9413443750086088,
"calibration/buffer_distribution_entropy": 0.8118769211393018,
"calibration/buffer_entropy_100bins": 0.8163705193225421,
"calibration/buffer_entropy_10bins": 0.8118769211393018,
"calibration/buffer_entropy_50bins": 0.8437264714491857,
"calibration/confidence_entropy": 0.4751647145810791,
"calibration/coverage@0%": 0.018616960887388335,
"calibration/coverage@1%": 0.018616960887388335,
"calibration/coverage@10%": 0.11996558000338249,
"calibration/coverage@15%": 0.31641030253688285,
"calibration/coverage@20%": 0.8198981422435141,
"calibration/coverage@25%": 0.9679174333611573,
"calibration/coverage@30%": 1.0,
"calibration/coverage@5%": 0.04231194351809785,
"calibration/distribution_entropy_10": 0.9257879515650167,
"calibration/distribution_entropy_100": 0.9322180704729982,
"calibration/ece": 0.15314606802324127,
"calibration/mean_confidence": 0.6360751615599385,
"calibration/unique_confidence_per_question": 0.9807291666666667,
"calibration/unique_confidences": 376.6,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017361111111111095,
"completions/max_length": 3637.2,
"completions/max_terminated_length": 3637.2,
"completions/mean_length": 718.997998046875,
"completions/mean_terminated_length": 731.70673828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.1079986500168748,
"grad_norm": 0.003335554851219058,
"learning_rate": 4.909638554216868e-06,
"loss": -0.0454,
"num_tokens": 86969061.0,
"reward": 0.9880038857460022,
"reward_std": 0.15024617910385132,
"rewards/accuracy_reward": 0.6736979126930237,
"rewards/batch_coverage_0": 0.2329769492149353,
"rewards/brier_reward": 0.7637104630470276,
"rewards/confidence_uniqueness_reward": 0.9286281347274781,
"rewards/format_reward": 0.9825520753860474,
"rewards/frontier_entropy_batch_reward": -0.3265267163515091,
"signal/accuracy_reward/centered_abs_mean": 0.17314996123313903,
"signal/accuracy_reward/group_std_mean": 0.2271723985671997,
"signal/accuracy_reward/group_zero_std_frac": 0.3583333373069763,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9806798100471497,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08657498061656951,
"signal/advantage_abs_mean": 0.7340533494949341,
"signal/advantage_pre_scale_abs_mean": 0.11192792057991027,
"signal/advantage_pre_scale_std": 0.17584609091281891,
"signal/advantage_std": 0.9833369493484497,
"signal/batch_coverage_0/centered_abs_mean": 0.1993844449520111,
"signal/batch_coverage_0/group_std_mean": 0.25716363787651064,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2259293019771576,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0199384443461895,
"signal/brier_reward/centered_abs_mean": 0.18940032422542571,
"signal/brier_reward/group_std_mean": 0.23733858466148378,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.2157978594303131,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01894003227353096,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.041976536810398105,
"signal/confidence_uniqueness_reward/group_std_mean": 0.06569402515888215,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.048248696699738504,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004197653895244002,
"signal/format_reward/centered_abs_mean": 0.02760959230363369,
"signal/format_reward/group_std_mean": 0.048797968029975894,
"signal/format_reward/group_zero_std_frac": 0.8083333373069763,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.15639951974153518,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.013804796151816845,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.36036349534988404,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4273237884044647,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41718631982803345,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.036036350578069684,
"step": 45
},
{
"calibration/aurc": 0.3161069454398612,
"calibration/batch_distribution_entropy": 0.9644760827545266,
"calibration/batch_entropy_100bins": 0.9511358010751498,
"calibration/batch_entropy_10bins": 0.9644760827545266,
"calibration/batch_entropy_50bins": 0.9631218868935271,
"calibration/batch_uniqueness": 0.9489304890631937,
"calibration/buffer_distribution_entropy": 0.8392170070252319,
"calibration/buffer_entropy_100bins": 0.8461842860655169,
"calibration/buffer_entropy_10bins": 0.8392170070252319,
"calibration/buffer_entropy_50bins": 0.8688301669951886,
"calibration/confidence_entropy": 0.4740886147426032,
"calibration/coverage@0%": 0.008463280011736029,
"calibration/coverage@1%": 0.008463280011736029,
"calibration/coverage@10%": 0.0237447516429417,
"calibration/coverage@15%": 0.09319288994081405,
"calibration/coverage@20%": 0.2054546457599283,
"calibration/coverage@25%": 0.3819263744551913,
"calibration/coverage@30%": 0.534008588531292,
"calibration/coverage@5%": 0.008463280011736029,
"calibration/distribution_entropy_10": 0.9644760827545266,
"calibration/distribution_entropy_100": 0.9511358010751498,
"calibration/ece": 0.15545994179292094,
"calibration/mean_confidence": 0.5602707976117317,
"calibration/unique_confidence_per_question": 0.9921875,
"calibration/unique_confidences": 381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01206597222222221,
"completions/max_length": 3788.8,
"completions/max_terminated_length": 3788.8,
"completions/mean_length": 727.9027099609375,
"completions/mean_terminated_length": 736.8207885742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.8,
"epoch": 0.11999850001874976,
"grad_norm": 0.003200685838237405,
"learning_rate": 4.759036144578314e-06,
"loss": -0.0375,
"num_tokens": 98452100.0,
"reward": 0.9884833931922913,
"reward_std": 0.1402774930000305,
"rewards/accuracy_reward": 0.6598958373069763,
"rewards/batch_coverage_0": 0.2569040894508362,
"rewards/brier_reward": 0.7681125044822693,
"rewards/confidence_uniqueness_reward": 0.9333637595176697,
"rewards/format_reward": 0.9877604246139526,
"rewards/frontier_entropy_batch_reward": -0.3118279218673706,
"signal/accuracy_reward/centered_abs_mean": 0.15991753339767456,
"signal/accuracy_reward/group_std_mean": 0.21020196974277497,
"signal/accuracy_reward/group_zero_std_frac": 0.40277777910232543,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9491617441177368,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07995876669883728,
"signal/advantage_abs_mean": 0.7388450384140015,
"signal/advantage_pre_scale_abs_mean": 0.1035029023885727,
"signal/advantage_pre_scale_std": 0.16441345810890198,
"signal/advantage_std": 0.983297336101532,
"signal/batch_coverage_0/centered_abs_mean": 0.19529581367969512,
"signal/batch_coverage_0/group_std_mean": 0.24793221652507783,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23113827407360077,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01952958106994629,
"signal/brier_reward/centered_abs_mean": 0.17141424715518952,
"signal/brier_reward/group_std_mean": 0.21658135950565338,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.2038181960582733,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01714142579585314,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.036786551028490065,
"signal/confidence_uniqueness_reward/group_std_mean": 0.058603516221046446,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04392699599266052,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0036786553915590047,
"signal/format_reward/centered_abs_mean": 0.021164279617369174,
"signal/format_reward/group_std_mean": 0.04010897055268288,
"signal/format_reward/group_zero_std_frac": 0.8361111164093018,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.12460318654775619,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.010582139808684587,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3541642606258392,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.42405728101730344,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.42257370352745055,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03541642650961876,
"step": 50
},
{
"epoch": 0.11999850001874976,
"eval_completions/clipped_ratio": 0.010416666666666649,
"eval_completions/max_length": 2050.5,
"eval_completions/max_terminated_length": 2050.5,
"eval_completions/mean_length": 720.9482727050781,
"eval_completions/mean_terminated_length": 728.4109090169271,
"eval_completions/min_length": 96.16666666666667,
"eval_completions/min_terminated_length": 252.5,
"eval_loss": 0.0,
"eval_num_tokens": 98452100.0,
"eval_reward": 0.9005749424298605,
"eval_reward_std": 0.2392077917853991,
"eval_rewards/accuracy_reward": 0.6744791567325592,
"eval_rewards/batch_coverage_0": 0.008232661212484041,
"eval_rewards/brier_reward": 0.7813109854857127,
"eval_rewards/confidence_uniqueness_reward": 0.885476420323054,
"eval_rewards/format_reward": 0.9895833233992258,
"eval_rewards/frontier_entropy_batch_reward": -0.9895833233992258,
"eval_runtime": 179.5931,
"eval_samples_per_second": 5.568,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4273545990387599,
"eval_signal/accuracy_reward/group_std_mean": 0.4683712025483449,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9062869151433309,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21367729951937994,
"eval_signal/advantage_abs_mean": 0.8733128209908804,
"eval_signal/advantage_pre_scale_abs_mean": 0.2092428207397461,
"eval_signal/advantage_pre_scale_std": 0.23747060696283975,
"eval_signal/advantage_std": 0.9864048759142557,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.17604178686936697,
"eval_signal/batch_coverage_0/group_std_mean": 0.26295527070760727,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0748915175596873,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.017604179369906586,
"eval_signal/brier_reward/centered_abs_mean": 0.19752245396375656,
"eval_signal/brier_reward/group_std_mean": 0.2500632430116336,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08338230103254318,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.019752246638139088,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.05403412195543448,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.08773908950388432,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.022890682332217693,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005403412350763877,
"eval_signal/format_reward/centered_abs_mean": 0.019965277363856632,
"eval_signal/format_reward/group_std_mean": 0.05294674759109815,
"eval_signal/format_reward/group_zero_std_frac": 0.7222222288449606,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.04037875309586525,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.009982638681928316,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.019965277363856632,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.05294674759109815,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.7222222288449606,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.00807575105379025,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0019965277363856635,
"eval_steps_per_second": 0.033,
"step": 50
},
{
"epoch": 0.11999850001874976,
"step": 50,
"train_probe_completions/clipped_ratio": 0.015277777777777779,
"train_probe_completions/max_length": 2122.8333333333335,
"train_probe_completions/max_terminated_length": 2122.8333333333335,
"train_probe_completions/mean_length": 716.4770914713541,
"train_probe_completions/mean_terminated_length": 727.799326578776,
"train_probe_completions/min_length": 0.0,
"train_probe_completions/min_terminated_length": 242.83333333333334,
"train_probe_loss": 0.0,
"train_probe_num_tokens": 98452100.0,
"train_probe_reward": 0.9082557559013367,
"train_probe_reward_std": 0.2357780635356903,
"train_probe_rewards/accuracy_reward": 0.6814236044883728,
"train_probe_rewards/batch_coverage_0": 0.025458375923335552,
"train_probe_rewards/brier_reward": 0.8000073134899139,
"train_probe_rewards/confidence_uniqueness_reward": 0.8881678680578867,
"train_probe_rewards/format_reward": 0.9904513855775198,
"train_probe_rewards/frontier_entropy_batch_reward": -0.9904513855775198,
"train_probe_runtime": 206.2151,
"train_probe_samples_per_second": 4.849,
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.4191080729166667,
"train_probe_signal/accuracy_reward/group_std_mean": 0.4634708563486735,
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9035188257694244,
"train_probe_signal/accuracy_reward/weight": 0.5,
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.20955403645833334,
"train_probe_signal/advantage_abs_mean": 0.8659235934416453,
"train_probe_signal/advantage_pre_scale_abs_mean": 0.20504274467627207,
"train_probe_signal/advantage_pre_scale_std": 0.23454017440478006,
"train_probe_signal/advantage_std": 0.9863985180854797,
"train_probe_signal/batch_coverage_0/centered_abs_mean": 0.16327512512604395,
"train_probe_signal/batch_coverage_0/group_std_mean": 0.23903479675451914,
"train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07043657451868057,
"train_probe_signal/batch_coverage_0/weight": 0.10000000149011612,
"train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.016327512450516224,
"train_probe_signal/brier_reward/centered_abs_mean": 0.17697453250487646,
"train_probe_signal/brier_reward/group_std_mean": 0.23091630637645721,
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07631279900670052,
"train_probe_signal/brier_reward/weight": 0.10000000149011612,
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.017697453498840332,
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.049318368236223854,
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.08279193627337615,
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02125573654969533,
"train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004931837009886901,
"train_probe_signal/format_reward/centered_abs_mean": 0.018500433613856632,
"train_probe_signal/format_reward/group_std_mean": 0.054015101244052253,
"train_probe_signal/format_reward/group_zero_std_frac": 0.6944444676240286,
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.03978658188134432,
"train_probe_signal/format_reward/weight": 0.5,
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.009250216806928316,
"train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.018500433613856632,
"train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.054015101244052253,
"train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.6944444676240286,
"train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.007957316547011336,
"train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0018500435204866033,
"train_probe_steps_per_second": 0.029
},
{
"calibration/aurc": 0.22472360402010977,
"calibration/batch_distribution_entropy": 0.9609207169219642,
"calibration/batch_entropy_100bins": 0.9516821872459627,
"calibration/batch_entropy_10bins": 0.9609207169219642,
"calibration/batch_entropy_50bins": 0.9619270817134706,
"calibration/batch_uniqueness": 0.9474521596011586,
"calibration/buffer_distribution_entropy": 0.8653756366467846,
"calibration/buffer_entropy_100bins": 0.8726938653024712,
"calibration/buffer_entropy_10bins": 0.8653756366467846,
"calibration/buffer_entropy_50bins": 0.8914594965398323,
"calibration/confidence_entropy": 0.4753242399571679,
"calibration/coverage@0%": 0.023157928701659374,
"calibration/coverage@1%": 0.023157928701659374,
"calibration/coverage@10%": 0.20255356707408798,
"calibration/coverage@15%": 0.29319682499541944,
"calibration/coverage@20%": 0.3988951974691966,
"calibration/coverage@25%": 0.6721432631962692,
"calibration/coverage@30%": 0.8863604042477233,
"calibration/coverage@5%": 0.13758263084949302,
"calibration/distribution_entropy_10": 0.9609207169219642,
"calibration/distribution_entropy_100": 0.9516821872459627,
"calibration/ece": 0.16699372970477053,
"calibration/mean_confidence": 0.5923917872653834,
"calibration/unique_confidence_per_question": 0.9854166666666668,
"calibration/unique_confidences": 378.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010329861111111093,
"completions/max_length": 3465.4,
"completions/max_terminated_length": 3465.4,
"completions/mean_length": 736.5135498046875,
"completions/mean_terminated_length": 744.2725219726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.2,
"epoch": 0.13199835002062474,
"grad_norm": 0.004147018771618605,
"learning_rate": 4.60843373493976e-06,
"loss": -0.0263,
"num_tokens": 110017312.0,
"reward": 0.9964199542999268,
"reward_std": 0.13358933180570604,
"rewards/accuracy_reward": 0.6614583253860473,
"rewards/batch_coverage_0": 0.2666813760995865,
"rewards/brier_reward": 0.7828863620758056,
"rewards/confidence_uniqueness_reward": 0.9379994750022889,
"rewards/format_reward": 0.9893229007720947,
"rewards/frontier_entropy_batch_reward": -0.2772741973400116,
"signal/accuracy_reward/centered_abs_mean": 0.1580186665058136,
"signal/accuracy_reward/group_std_mean": 0.20960874259471893,
"signal/accuracy_reward/group_zero_std_frac": 0.38888888955116274,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.97646164894104,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0790093332529068,
"signal/advantage_abs_mean": 0.7356102347373963,
"signal/advantage_pre_scale_abs_mean": 0.09942354112863541,
"signal/advantage_pre_scale_std": 0.1567298650741577,
"signal/advantage_std": 0.9832465767860412,
"signal/batch_coverage_0/centered_abs_mean": 0.18050281703472137,
"signal/batch_coverage_0/group_std_mean": 0.2307935357093811,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22286758720874786,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01805028095841408,
"signal/brier_reward/centered_abs_mean": 0.1542533814907074,
"signal/brier_reward/group_std_mean": 0.198762246966362,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19102311730384827,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01542533803731203,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.030431923642754554,
"signal/confidence_uniqueness_reward/group_std_mean": 0.04879971742630005,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03733023479580879,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003043192345649004,
"signal/format_reward/centered_abs_mean": 0.016975911520421504,
"signal/format_reward/group_std_mean": 0.03276526555418968,
"signal/format_reward/group_zero_std_frac": 0.8611111164093017,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.10242971032857895,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.008487955760210752,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3267883062362671,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3983007729053497,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4050808668136597,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03267883062362671,
"step": 55
},
{
"calibration/aurc": 0.2827445677122261,
"calibration/batch_distribution_entropy": 0.9694165902358778,
"calibration/batch_entropy_100bins": 0.9543224931993513,
"calibration/batch_entropy_10bins": 0.9694165902358778,
"calibration/batch_entropy_50bins": 0.9665939385794944,
"calibration/batch_uniqueness": 0.9510640820618093,
"calibration/buffer_distribution_entropy": 0.880743493705171,
"calibration/buffer_entropy_100bins": 0.8888775798420593,
"calibration/buffer_entropy_10bins": 0.880743493705171,
"calibration/buffer_entropy_50bins": 0.904910653236246,
"calibration/confidence_entropy": 0.4844323737164452,
"calibration/coverage@0%": 0.0135758135283677,
"calibration/coverage@1%": 0.0135758135283677,
"calibration/coverage@10%": 0.08098455606724095,
"calibration/coverage@15%": 0.3142387701904429,
"calibration/coverage@20%": 0.4361154606476519,
"calibration/coverage@25%": 0.5866153823361311,
"calibration/coverage@30%": 0.679600433190058,
"calibration/coverage@5%": 0.0135758135283677,
"calibration/distribution_entropy_10": 0.9694165902358778,
"calibration/distribution_entropy_100": 0.9543224931993513,
"calibration/ece": 0.14178054807969714,
"calibration/mean_confidence": 0.5263035042002124,
"calibration/unique_confidence_per_question": 0.9895833333333334,
"calibration/unique_confidences": 380.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012500000000000022,
"completions/max_length": 3425.8,
"completions/max_terminated_length": 3425.8,
"completions/mean_length": 742.4217041015625,
"completions/mean_terminated_length": 751.9814697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.2,
"epoch": 0.14399820002249972,
"grad_norm": 0.00289203436113894,
"learning_rate": 4.457831325301205e-06,
"loss": -0.0292,
"num_tokens": 121666586.0,
"reward": 0.9800794243812561,
"reward_std": 0.14277701675891877,
"rewards/accuracy_reward": 0.6308159708976746,
"rewards/batch_coverage_0": 0.2683397114276886,
"rewards/brier_reward": 0.7775330781936646,
"rewards/confidence_uniqueness_reward": 0.9373286366462708,
"rewards/format_reward": 0.9873263835906982,
"rewards/frontier_entropy_batch_reward": -0.27311920523643496,
"signal/accuracy_reward/centered_abs_mean": 0.174853515625,
"signal/accuracy_reward/group_std_mean": 0.226035276055336,
"signal/accuracy_reward/group_zero_std_frac": 0.37222222685813905,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0188154578208923,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0874267578125,
"signal/advantage_abs_mean": 0.7372269749641418,
"signal/advantage_pre_scale_abs_mean": 0.10599318891763687,
"signal/advantage_pre_scale_std": 0.1663038432598114,
"signal/advantage_std": 0.983307147026062,
"signal/batch_coverage_0/centered_abs_mean": 0.18373815715312958,
"signal/batch_coverage_0/group_std_mean": 0.23306281864643097,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21616858541965484,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.018373816460371017,
"signal/brier_reward/centered_abs_mean": 0.16176026165485383,
"signal/brier_reward/group_std_mean": 0.20582883358001708,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18924815058708191,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016176026687026022,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03418286256492138,
"signal/confidence_uniqueness_reward/group_std_mean": 0.05919753760099411,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03993520997464657,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003418286330997944,
"signal/format_reward/centered_abs_mean": 0.022428384982049465,
"signal/format_reward/group_std_mean": 0.04563566856086254,
"signal/format_reward/group_zero_std_frac": 0.8,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.12949552983045579,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.011214192491024733,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3305469274520874,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4020517349243164,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.38890537023544314,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033054694533348083,
"step": 60
},
{
"calibration/aurc": 0.17680634517317753,
"calibration/batch_distribution_entropy": 0.9223857930936965,
"calibration/batch_entropy_100bins": 0.9294423511504757,
"calibration/batch_entropy_10bins": 0.9223857930936965,
"calibration/batch_entropy_50bins": 0.9373286928503596,
"calibration/batch_uniqueness": 0.9415649447642684,
"calibration/buffer_distribution_entropy": 0.8967923017175998,
"calibration/buffer_entropy_100bins": 0.903105019101886,
"calibration/buffer_entropy_10bins": 0.8967923017175998,
"calibration/buffer_entropy_50bins": 0.9176593724373063,
"calibration/confidence_entropy": 0.4511241150397975,
"calibration/coverage@0%": 0.033093853390412764,
"calibration/coverage@1%": 0.033093853390412764,
"calibration/coverage@10%": 0.5010672240079564,
"calibration/coverage@15%": 0.5694402006428644,
"calibration/coverage@20%": 0.6407083346546165,
"calibration/coverage@25%": 0.7087965713901342,
"calibration/coverage@30%": 0.7420715799184168,
"calibration/coverage@5%": 0.3894755245494424,
"calibration/distribution_entropy_10": 0.9223857930936965,
"calibration/distribution_entropy_100": 0.9294423511504757,
"calibration/ece": 0.13352572398916868,
"calibration/mean_confidence": 0.6091362952833019,
"calibration/unique_confidence_per_question": 0.9739583333333334,
"calibration/unique_confidences": 374.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02065972222222221,
"completions/max_length": 2984.6,
"completions/max_terminated_length": 2984.6,
"completions/mean_length": 691.2902954101562,
"completions/mean_terminated_length": 705.7674682617187,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.8,
"epoch": 0.1559980500243747,
"grad_norm": 0.02314193919301033,
"learning_rate": 4.307228915662651e-06,
"loss": -0.0495,
"num_tokens": 132724298.0,
"reward": 0.9898407101631165,
"reward_std": 0.1416984274983406,
"rewards/accuracy_reward": 0.65703125,
"rewards/batch_coverage_0": 0.31068081855773927,
"rewards/brier_reward": 0.8045204639434814,
"rewards/confidence_uniqueness_reward": 0.9254239439964295,
"rewards/format_reward": 0.9787326455116272,
"rewards/frontier_entropy_batch_reward": -0.32103757858276366,
"signal/accuracy_reward/centered_abs_mean": 0.14537217915058137,
"signal/accuracy_reward/group_std_mean": 0.19845550060272216,
"signal/accuracy_reward/group_zero_std_frac": 0.40833333134651184,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9093841433525085,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07268608957529069,
"signal/advantage_abs_mean": 0.7226288199424744,
"signal/advantage_pre_scale_abs_mean": 0.10107557326555253,
"signal/advantage_pre_scale_std": 0.171230810880661,
"signal/advantage_std": 0.983215045928955,
"signal/batch_coverage_0/centered_abs_mean": 0.17129134237766266,
"signal/batch_coverage_0/group_std_mean": 0.21862670183181762,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2162470906972885,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017129134200513364,
"signal/brier_reward/centered_abs_mean": 0.14572922885417938,
"signal/brier_reward/group_std_mean": 0.1897602289915085,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1829657733440399,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.014572923444211483,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04564875364303589,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07703709006309509,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.057298554480075835,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004564875271171331,
"signal/format_reward/centered_abs_mean": 0.03393554724752903,
"signal/format_reward/group_std_mean": 0.06373440623283386,
"signal/format_reward/group_zero_std_frac": 0.7361111164093017,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.21068618595600128,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.016967773623764514,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34571743607521055,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4146285653114319,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43939276933670046,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03457174450159073,
"step": 65
},
{
"calibration/aurc": 0.255712961146632,
"calibration/batch_distribution_entropy": 0.9388658306441016,
"calibration/batch_entropy_100bins": 0.9374460761801705,
"calibration/batch_entropy_10bins": 0.9388658306441016,
"calibration/batch_entropy_50bins": 0.9461765391704772,
"calibration/batch_uniqueness": 0.943961165940431,
"calibration/buffer_distribution_entropy": 0.9027612944060565,
"calibration/buffer_entropy_100bins": 0.9118606001305496,
"calibration/buffer_entropy_10bins": 0.9027612944060565,
"calibration/buffer_entropy_50bins": 0.9241160182250978,
"calibration/confidence_entropy": 0.441581652647624,
"calibration/coverage@0%": 0.09105444880082295,
"calibration/coverage@1%": 0.09606837637742462,
"calibration/coverage@10%": 0.20810418588988497,
"calibration/coverage@15%": 0.25194888797714865,
"calibration/coverage@20%": 0.2779983592606131,
"calibration/coverage@25%": 0.31211272924258926,
"calibration/coverage@30%": 0.6796830076800402,
"calibration/coverage@5%": 0.13181085772043027,
"calibration/distribution_entropy_10": 0.9388658306441016,
"calibration/distribution_entropy_100": 0.9374460761801705,
"calibration/ece": 0.18058968785492258,
"calibration/mean_confidence": 0.5582756787869785,
"calibration/unique_confidence_per_question": 0.9671875,
"calibration/unique_confidences": 371.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02960069444444442,
"completions/max_length": 3201.8,
"completions/max_terminated_length": 3201.8,
"completions/mean_length": 635.1415893554688,
"completions/mean_terminated_length": 654.58779296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.4,
"epoch": 0.16799790002624967,
"grad_norm": 0.00906718336045742,
"learning_rate": 4.156626506024097e-06,
"loss": -0.0809,
"num_tokens": 143119273.0,
"reward": 0.9665384531021118,
"reward_std": 0.159322065114975,
"rewards/accuracy_reward": 0.6370659828186035,
"rewards/batch_coverage_0": 0.2889437019824982,
"rewards/brier_reward": 0.7667541742324829,
"rewards/confidence_uniqueness_reward": 0.9159990549087524,
"rewards/format_reward": 0.9701388835906982,
"rewards/frontier_entropy_batch_reward": -0.3423368394374847,
"signal/accuracy_reward/centered_abs_mean": 0.16377495527267455,
"signal/accuracy_reward/group_std_mean": 0.212760990858078,
"signal/accuracy_reward/group_zero_std_frac": 0.4083333373069763,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9339677453041076,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08188747763633727,
"signal/advantage_abs_mean": 0.7247249484062195,
"signal/advantage_pre_scale_abs_mean": 0.114870385825634,
"signal/advantage_pre_scale_std": 0.1892496645450592,
"signal/advantage_std": 0.9833415269851684,
"signal/batch_coverage_0/centered_abs_mean": 0.19291015863418579,
"signal/batch_coverage_0/group_std_mean": 0.2440529942512512,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21951412558555602,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.019291016831994058,
"signal/brier_reward/centered_abs_mean": 0.17383241057395935,
"signal/brier_reward/group_std_mean": 0.2209314674139023,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19668594300746917,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.017383241280913352,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.05868452787399292,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09797782599925994,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0654455192387104,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005868452694267034,
"signal/format_reward/centered_abs_mean": 0.04973958320915699,
"signal/format_reward/group_std_mean": 0.08830733895301819,
"signal/format_reward/group_zero_std_frac": 0.6583333373069763,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.2726438879966736,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.024869791604578494,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.36462308168411256,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.43168188333511354,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4166643261909485,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03646230846643448,
"step": 70
},
{
"calibration/aurc": 0.20402879477321081,
"calibration/batch_distribution_entropy": 0.9100570338201137,
"calibration/batch_entropy_100bins": 0.9255242664998121,
"calibration/batch_entropy_10bins": 0.9100570338201137,
"calibration/batch_entropy_50bins": 0.9331518001514901,
"calibration/batch_uniqueness": 0.9388446130619105,
"calibration/buffer_distribution_entropy": 0.9075425189572274,
"calibration/buffer_entropy_100bins": 0.9188536488152922,
"calibration/buffer_entropy_10bins": 0.9075425189572274,
"calibration/buffer_entropy_50bins": 0.9292881590319055,
"calibration/confidence_entropy": 0.4662604234655504,
"calibration/coverage@0%": 0.01183253152297172,
"calibration/coverage@1%": 0.01183253152297172,
"calibration/coverage@10%": 0.3507062010567685,
"calibration/coverage@15%": 0.4746198083784761,
"calibration/coverage@20%": 0.6568163427547684,
"calibration/coverage@25%": 0.7129386973791396,
"calibration/coverage@30%": 0.7461053848765358,
"calibration/coverage@5%": 0.06645065896718613,
"calibration/distribution_entropy_10": 0.9100570338201137,
"calibration/distribution_entropy_100": 0.9255242664998121,
"calibration/ece": 0.1262777985805592,
"calibration/mean_confidence": 0.6514417334269297,
"calibration/unique_confidence_per_question": 0.975,
"calibration/unique_confidences": 374.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023263888888888883,
"completions/max_length": 2466.4,
"completions/max_terminated_length": 2466.4,
"completions/mean_length": 585.1560791015625,
"completions/mean_terminated_length": 599.1790893554687,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.2,
"epoch": 0.17999775002812465,
"grad_norm": 0.004078112542629242,
"learning_rate": 4.006024096385543e-06,
"loss": -0.0624,
"num_tokens": 152925167.0,
"reward": 0.9990050196647644,
"reward_std": 0.15093266367912292,
"rewards/accuracy_reward": 0.6853298544883728,
"rewards/batch_coverage_0": 0.3030748426914215,
"rewards/brier_reward": 0.8090933322906494,
"rewards/confidence_uniqueness_reward": 0.9204216599464417,
"rewards/format_reward": 0.976475715637207,
"rewards/frontier_entropy_batch_reward": -0.3515673041343689,
"signal/accuracy_reward/centered_abs_mean": 0.1615180104970932,
"signal/accuracy_reward/group_std_mean": 0.21247020363807678,
"signal/accuracy_reward/group_zero_std_frac": 0.4,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0372991800308227,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0807590052485466,
"signal/advantage_abs_mean": 0.7307239651679993,
"signal/advantage_pre_scale_abs_mean": 0.10921901911497116,
"signal/advantage_pre_scale_std": 0.18343055248260498,
"signal/advantage_std": 0.9831865429878235,
"signal/batch_coverage_0/centered_abs_mean": 0.15994410514831542,
"signal/batch_coverage_0/group_std_mean": 0.20556128919124603,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20666089951992034,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015994411334395407,
"signal/brier_reward/centered_abs_mean": 0.14350559711456298,
"signal/brier_reward/group_std_mean": 0.18532647788524628,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18572898209095,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.014350558631122112,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.052418901771306994,
"signal/confidence_uniqueness_reward/group_std_mean": 0.0843198612332344,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06781294569373131,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005241890251636505,
"signal/format_reward/centered_abs_mean": 0.04034830741584301,
"signal/format_reward/group_std_mean": 0.07034000232815743,
"signal/format_reward/group_zero_std_frac": 0.7361111283302307,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.2581541657447815,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.020174153707921506,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3405683159828186,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40916064381599426,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.44408488273620605,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0340568333864212,
"step": 75
},
{
"calibration/aurc": 0.21312748209126955,
"calibration/batch_distribution_entropy": 0.9192534724128174,
"calibration/batch_entropy_100bins": 0.9296142793543695,
"calibration/batch_entropy_10bins": 0.9192534724128174,
"calibration/batch_entropy_50bins": 0.9355600534978465,
"calibration/batch_uniqueness": 0.9408131449770669,
"calibration/buffer_distribution_entropy": 0.9106141449919584,
"calibration/buffer_entropy_100bins": 0.9244551232284438,
"calibration/buffer_entropy_10bins": 0.9106141449919584,
"calibration/buffer_entropy_50bins": 0.9331678191582089,
"calibration/confidence_entropy": 0.4631196946712599,
"calibration/coverage@0%": 0.005760600891838782,
"calibration/coverage@1%": 0.005760600891838782,
"calibration/coverage@10%": 0.27219365801624273,
"calibration/coverage@15%": 0.3836405255871157,
"calibration/coverage@20%": 0.5819669751686798,
"calibration/coverage@25%": 0.7454837778785506,
"calibration/coverage@30%": 0.8588018973538969,
"calibration/coverage@5%": 0.0398236205012986,
"calibration/distribution_entropy_10": 0.9192534724128174,
"calibration/distribution_entropy_100": 0.9296142793543695,
"calibration/ece": 0.14439641011437374,
"calibration/mean_confidence": 0.6531200786070206,
"calibration/unique_confidence_per_question": 0.9916666666666666,
"calibration/unique_confidences": 380.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01571180555555556,
"completions/max_length": 3380.0,
"completions/max_terminated_length": 3380.0,
"completions/mean_length": 588.9006958007812,
"completions/mean_terminated_length": 598.3701049804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.19199760002999963,
"grad_norm": 0.00266805081628263,
"learning_rate": 3.855421686746989e-06,
"loss": -0.0537,
"num_tokens": 162762583.0,
"reward": 0.9848804593086242,
"reward_std": 0.14701928794384003,
"rewards/accuracy_reward": 0.6446180582046509,
"rewards/batch_coverage_0": 0.28264726996421813,
"rewards/brier_reward": 0.7860515952110291,
"rewards/confidence_uniqueness_reward": 0.9323421716690063,
"rewards/format_reward": 0.9842882037162781,
"rewards/frontier_entropy_batch_reward": -0.296767795085907,
"signal/accuracy_reward/centered_abs_mean": 0.16694878339767455,
"signal/accuracy_reward/group_std_mean": 0.21657846570014955,
"signal/accuracy_reward/group_zero_std_frac": 0.397222226858139,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9957749605178833,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08347439169883727,
"signal/advantage_abs_mean": 0.7289156675338745,
"signal/advantage_pre_scale_abs_mean": 0.10731721073389053,
"signal/advantage_pre_scale_std": 0.17323702573776245,
"signal/advantage_std": 0.983283269405365,
"signal/batch_coverage_0/centered_abs_mean": 0.16982728838920594,
"signal/batch_coverage_0/group_std_mean": 0.21664536595344544,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20365143120288848,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.016982728615403175,
"signal/brier_reward/centered_abs_mean": 0.1532373309135437,
"signal/brier_reward/group_std_mean": 0.1974636048078537,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18305174112319947,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.015323733352124691,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04000471830368042,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07147712409496307,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.047922450304031375,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004000471858307719,
"signal/format_reward/centered_abs_mean": 0.02828233428299427,
"signal/format_reward/group_std_mean": 0.058084848523139956,
"signal/format_reward/group_zero_std_frac": 0.7472222208976745,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.16909490078687667,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.014141167141497135,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34231672883033754,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.41321871280670164,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4113028526306152,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034231672435998915,
"step": 80
},
{
"calibration/aurc": 0.1632135076835743,
"calibration/batch_distribution_entropy": 0.9659063993196713,
"calibration/batch_entropy_100bins": 0.9536454574623983,
"calibration/batch_entropy_10bins": 0.9659063993196713,
"calibration/batch_entropy_50bins": 0.965706134473739,
"calibration/batch_uniqueness": 0.9492283081586897,
"calibration/buffer_distribution_entropy": 0.915495808049182,
"calibration/buffer_entropy_100bins": 0.9300554231101016,
"calibration/buffer_entropy_10bins": 0.915495808049182,
"calibration/buffer_entropy_50bins": 0.9376210976212376,
"calibration/confidence_entropy": 0.45173476267042323,
"calibration/coverage@0%": 0.01754124210834912,
"calibration/coverage@1%": 0.01754124210834912,
"calibration/coverage@10%": 0.3911290055639153,
"calibration/coverage@15%": 0.496067459083426,
"calibration/coverage@20%": 0.6235121405255534,
"calibration/coverage@25%": 0.7748134369680345,
"calibration/coverage@30%": 0.8836314921488823,
"calibration/coverage@5%": 0.25191154408155897,
"calibration/distribution_entropy_10": 0.9659063993196713,
"calibration/distribution_entropy_100": 0.9536454574623983,
"calibration/ece": 0.15774952942281292,
"calibration/mean_confidence": 0.526197596741875,
"calibration/unique_confidence_per_question": 0.9817708333333334,
"calibration/unique_confidences": 377.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.020746527777777767,
"completions/max_length": 3311.0,
"completions/max_terminated_length": 3311.0,
"completions/mean_length": 603.830908203125,
"completions/mean_terminated_length": 616.63349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.2039974500318746,
"grad_norm": 0.0040805041790008545,
"learning_rate": 3.7048192771084342e-06,
"loss": -0.0584,
"num_tokens": 172805915.0,
"reward": 1.0033825635910034,
"reward_std": 0.14983320534229277,
"rewards/accuracy_reward": 0.68515625,
"rewards/batch_coverage_0": 0.296441388130188,
"rewards/brier_reward": 0.7884108066558838,
"rewards/confidence_uniqueness_reward": 0.9267748475074769,
"rewards/format_reward": 0.978125,
"rewards/frontier_entropy_batch_reward": -0.2942079544067383,
"signal/accuracy_reward/centered_abs_mean": 0.160009765625,
"signal/accuracy_reward/group_std_mean": 0.2096061587333679,
"signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0137329697608948,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0800048828125,
"signal/advantage_abs_mean": 0.722009789943695,
"signal/advantage_pre_scale_abs_mean": 0.10756922066211701,
"signal/advantage_pre_scale_std": 0.18065415024757386,
"signal/advantage_std": 0.9832139730453491,
"signal/batch_coverage_0/centered_abs_mean": 0.1854369521141052,
"signal/batch_coverage_0/group_std_mean": 0.23206678926944732,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23619085252285005,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01854369565844536,
"signal/brier_reward/centered_abs_mean": 0.15676779747009278,
"signal/brier_reward/group_std_mean": 0.20251947045326232,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19893690645694734,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01567677892744541,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.048682890459895135,
"signal/confidence_uniqueness_reward/group_std_mean": 0.08520418256521226,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06293119937181473,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004868289083242416,
"signal/format_reward/centered_abs_mean": 0.03848741352558136,
"signal/format_reward/group_std_mean": 0.07385490760207176,
"signal/format_reward/group_zero_std_frac": 0.6972222208976746,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.2500002160668373,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01924370676279068,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3430877685546875,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4109717309474945,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4361910581588745,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03430877774953842,
"step": 85
},
{
"calibration/aurc": 0.12301126961784772,
"calibration/batch_distribution_entropy": 0.9331017295414563,
"calibration/batch_entropy_100bins": 0.9381418424454226,
"calibration/batch_entropy_10bins": 0.9331017295414563,
"calibration/batch_entropy_50bins": 0.9462402462271073,
"calibration/batch_uniqueness": 0.9423444019843231,
"calibration/buffer_distribution_entropy": 0.9202602316335554,
"calibration/buffer_entropy_100bins": 0.9351472758276472,
"calibration/buffer_entropy_10bins": 0.9202602316335554,
"calibration/buffer_entropy_50bins": 0.9418012111533827,
"calibration/confidence_entropy": 0.4612580112288266,
"calibration/coverage@0%": 0.08628837167868916,
"calibration/coverage@1%": 0.12313788358115851,
"calibration/coverage@10%": 0.49191285867993495,
"calibration/coverage@15%": 0.6691996369614962,
"calibration/coverage@20%": 0.8408953364435259,
"calibration/coverage@25%": 0.9078131047843024,
"calibration/coverage@30%": 0.9499502310974621,
"calibration/coverage@5%": 0.2717661114689772,
"calibration/distribution_entropy_10": 0.9331017295414563,
"calibration/distribution_entropy_100": 0.9381418424454226,
"calibration/ece": 0.12834374377029065,
"calibration/mean_confidence": 0.6120755217205869,
"calibration/unique_confidence_per_question": 0.9484375,
"calibration/unique_confidences": 364.2,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.050694444444444445,
"completions/max_length": 2518.6,
"completions/max_terminated_length": 2518.6,
"completions/mean_length": 554.4845703125,
"completions/mean_terminated_length": 584.150537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.4,
"epoch": 0.2159973000337496,
"grad_norm": 0.005482567939907312,
"learning_rate": 3.5542168674698798e-06,
"loss": -0.1229,
"num_tokens": 182262249.0,
"reward": 0.9727075695991516,
"reward_std": 0.1719568520784378,
"rewards/accuracy_reward": 0.6586805582046509,
"rewards/batch_coverage_0": 0.31638202667236326,
"rewards/brier_reward": 0.7820794343948364,
"rewards/confidence_uniqueness_reward": 0.8977606773376465,
"rewards/format_reward": 0.9493055462837219,
"rewards/frontier_entropy_batch_reward": -0.3090770959854126,
"signal/accuracy_reward/centered_abs_mean": 0.14839409589767455,
"signal/accuracy_reward/group_std_mean": 0.1954744428396225,
"signal/accuracy_reward/group_zero_std_frac": 0.4444444477558136,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8960744976997376,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07419704794883727,
"signal/advantage_abs_mean": 0.7284149289131164,
"signal/advantage_pre_scale_abs_mean": 0.12656941562891005,
"signal/advantage_pre_scale_std": 0.21349311172962188,
"signal/advantage_std": 0.9832640409469604,
"signal/batch_coverage_0/centered_abs_mean": 0.1728483885526657,
"signal/batch_coverage_0/group_std_mean": 0.21621764600276946,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20976388156414033,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01728483885526657,
"signal/brier_reward/centered_abs_mean": 0.16384423673152923,
"signal/brier_reward/group_std_mean": 0.2075037896633148,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19847835898399352,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016384424082934855,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08556002229452134,
"signal/confidence_uniqueness_reward/group_std_mean": 0.12629829049110414,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.10300634056329727,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008556002285331488,
"signal/format_reward/centered_abs_mean": 0.0777560755610466,
"signal/format_reward/group_std_mean": 0.11778671145439149,
"signal/format_reward/group_zero_std_frac": 0.6083333492279053,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.465447860956192,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0388780377805233,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33695358633995054,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40606662034988406,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4137517988681793,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03369535878300667,
"step": 90
},
{
"calibration/aurc": 0.17871668297594143,
"calibration/batch_distribution_entropy": 0.9630264574407592,
"calibration/batch_entropy_100bins": 0.953211917202391,
"calibration/batch_entropy_10bins": 0.9630264574407592,
"calibration/batch_entropy_50bins": 0.9643523891448694,
"calibration/batch_uniqueness": 0.9492980405470421,
"calibration/buffer_distribution_entropy": 0.9238771059311623,
"calibration/buffer_entropy_100bins": 0.9393248125495454,
"calibration/buffer_entropy_10bins": 0.9238771059311623,
"calibration/buffer_entropy_50bins": 0.9451495151031304,
"calibration/confidence_entropy": 0.4737993854396472,
"calibration/coverage@0%": 0.07179699490704275,
"calibration/coverage@1%": 0.07179699490704275,
"calibration/coverage@10%": 0.37936506941291626,
"calibration/coverage@15%": 0.5761405617386479,
"calibration/coverage@20%": 0.63375926663008,
"calibration/coverage@25%": 0.6948149018962416,
"calibration/coverage@30%": 0.7423140495867768,
"calibration/coverage@5%": 0.15318227629232414,
"calibration/distribution_entropy_10": 0.9630264574407592,
"calibration/distribution_entropy_100": 0.953211917202391,
"calibration/ece": 0.14511776380834202,
"calibration/mean_confidence": 0.5647102282125718,
"calibration/unique_confidence_per_question": 0.9744791666666668,
"calibration/unique_confidences": 374.2,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03229166666666668,
"completions/max_length": 3165.2,
"completions/max_terminated_length": 3165.2,
"completions/mean_length": 608.8990478515625,
"completions/mean_terminated_length": 629.1340454101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.4,
"epoch": 0.22799715003562457,
"grad_norm": 0.003710468765348196,
"learning_rate": 3.4036144578313257e-06,
"loss": -0.09,
"num_tokens": 192368446.0,
"reward": 0.9795109272003174,
"reward_std": 0.15554179847240449,
"rewards/accuracy_reward": 0.6442708373069763,
"rewards/batch_coverage_0": 0.32638593316078185,
"rewards/brier_reward": 0.7930331349372863,
"rewards/confidence_uniqueness_reward": 0.9162774443626404,
"rewards/format_reward": 0.9677083373069764,
"rewards/frontier_entropy_batch_reward": -0.30048283040523527,
"signal/accuracy_reward/centered_abs_mean": 0.14325086772441864,
"signal/accuracy_reward/group_std_mean": 0.19051359593868256,
"signal/accuracy_reward/group_zero_std_frac": 0.45555556416511533,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.91033376455307,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07162543386220932,
"signal/advantage_abs_mean": 0.7236616730690002,
"signal/advantage_pre_scale_abs_mean": 0.11148717403411865,
"signal/advantage_pre_scale_std": 0.18938724100589752,
"signal/advantage_std": 0.9832141995429993,
"signal/batch_coverage_0/centered_abs_mean": 0.165391007065773,
"signal/batch_coverage_0/group_std_mean": 0.21068246960639953,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2101570636034012,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.016539101675152778,
"signal/brier_reward/centered_abs_mean": 0.15271125733852386,
"signal/brier_reward/group_std_mean": 0.19472625851631165,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19408320188522338,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.015271125920116901,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.061642435193061826,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09935838431119919,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07879238203167915,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006164243910461664,
"signal/format_reward/centered_abs_mean": 0.05253906324505806,
"signal/format_reward/group_std_mean": 0.08931401669979096,
"signal/format_reward/group_zero_std_frac": 0.669444453716278,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.335909104347229,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.02626953162252903,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32878002524375916,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40042484402656553,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4183509528636932,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.032878002524375914,
"step": 95
},
{
"calibration/aurc": 0.1454110969088863,
"calibration/batch_distribution_entropy": 0.9613831954056984,
"calibration/batch_entropy_100bins": 0.9493371779893444,
"calibration/batch_entropy_10bins": 0.9613831954056984,
"calibration/batch_entropy_50bins": 0.9593376772305942,
"calibration/batch_uniqueness": 0.9485584531953885,
"calibration/buffer_distribution_entropy": 0.9277519896654418,
"calibration/buffer_entropy_100bins": 0.943189982754457,
"calibration/buffer_entropy_10bins": 0.9277519896654418,
"calibration/buffer_entropy_50bins": 0.9483683845977355,
"calibration/confidence_entropy": 0.4596341598052634,
"calibration/coverage@0%": 0.04663136393391499,
"calibration/coverage@1%": 0.04663136393391499,
"calibration/coverage@10%": 0.3281475131843537,
"calibration/coverage@15%": 0.6423179933915689,
"calibration/coverage@20%": 0.7606484419125537,
"calibration/coverage@25%": 0.8922567125625301,
"calibration/coverage@30%": 0.9526003355271648,
"calibration/coverage@5%": 0.08622573245379841,
"calibration/distribution_entropy_10": 0.9613831954056984,
"calibration/distribution_entropy_100": 0.9493371779893444,
"calibration/ece": 0.154073196829507,
"calibration/mean_confidence": 0.5689832250180208,
"calibration/unique_confidence_per_question": 0.9723958333333332,
"calibration/unique_confidences": 373.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029340277777777767,
"completions/max_length": 3467.8,
"completions/max_terminated_length": 3467.8,
"completions/mean_length": 663.86259765625,
"completions/mean_terminated_length": 683.8996948242187,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.23999700003749952,
"grad_norm": 0.019849933683872223,
"learning_rate": 3.2530120481927713e-06,
"loss": -0.0789,
"num_tokens": 203115215.0,
"reward": 0.9892123222351075,
"reward_std": 0.1588016241788864,
"rewards/accuracy_reward": 0.6633680582046508,
"rewards/batch_coverage_0": 0.30833343267440794,
"rewards/brier_reward": 0.7872236728668213,
"rewards/confidence_uniqueness_reward": 0.919810950756073,
"rewards/format_reward": 0.9704861164093017,
"rewards/frontier_entropy_batch_reward": -0.2925153970718384,
"signal/accuracy_reward/centered_abs_mean": 0.1627821147441864,
"signal/accuracy_reward/group_std_mean": 0.2125428557395935,
"signal/accuracy_reward/group_zero_std_frac": 0.4111111044883728,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0067266941070556,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0813910573720932,
"signal/advantage_abs_mean": 0.7315644145011901,
"signal/advantage_pre_scale_abs_mean": 0.1147262141108513,
"signal/advantage_pre_scale_std": 0.189102640748024,
"signal/advantage_std": 0.9832417488098144,
"signal/batch_coverage_0/centered_abs_mean": 0.18047589659690857,
"signal/batch_coverage_0/group_std_mean": 0.22958629131317138,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22420001327991484,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01804758906364441,
"signal/brier_reward/centered_abs_mean": 0.1647311270236969,
"signal/brier_reward/group_std_mean": 0.20903607010841369,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20256026089191437,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016473113372921944,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.05859675332903862,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09514907449483871,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07238487452268601,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005859675444662571,
"signal/format_reward/centered_abs_mean": 0.04894748292863369,
"signal/format_reward/group_std_mean": 0.08429807126522064,
"signal/format_reward/group_zero_std_frac": 0.6833333373069763,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.29983839094638826,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.024473741464316846,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33697280287742615,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4080432951450348,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.420040899515152,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033697281032800674,
"step": 100
},
{
"epoch": 0.23999700003749952,
"eval_completions/clipped_ratio": 0.021527777777777774,
"eval_completions/max_length": 2685.8333333333335,
"eval_completions/max_terminated_length": 2685.8333333333335,
"eval_completions/mean_length": 686.6156514485677,
"eval_completions/mean_terminated_length": 701.6330871582031,
"eval_completions/min_length": 0.0,
"eval_completions/min_terminated_length": 228.66666666666666,
"eval_loss": 0.0,
"eval_num_tokens": 203115215.0,
"eval_reward": 0.8787461519241333,
"eval_reward_std": 0.2547052949666977,
"eval_rewards/accuracy_reward": 0.6397569477558136,
"eval_rewards/batch_coverage_0": 0.02475695653508107,
"eval_rewards/brier_reward": 0.7724942763646444,
"eval_rewards/confidence_uniqueness_reward": 0.874758760134379,
"eval_rewards/format_reward": 0.9791666666666666,
"eval_rewards/frontier_entropy_batch_reward": -0.9791666666666666,
"eval_runtime": 207.3585,
"eval_samples_per_second": 4.823,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4423285573720932,
"eval_signal/accuracy_reward/group_std_mean": 0.4765505294005076,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8821068207422892,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2211642786860466,
"eval_signal/advantage_abs_mean": 0.8672481079896291,
"eval_signal/advantage_pre_scale_abs_mean": 0.22098190089066824,
"eval_signal/advantage_pre_scale_std": 0.25322698801755905,
"eval_signal/advantage_std": 0.9864300489425659,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.19854053358236948,
"eval_signal/batch_coverage_0/group_std_mean": 0.2833152214686076,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07916944163541,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.01985405369972189,
"eval_signal/brier_reward/centered_abs_mean": 0.2045312076807022,
"eval_signal/brier_reward/group_std_mean": 0.26509985824426013,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08176350096861522,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.020453120892246563,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.0659493872274955,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.12324381371339162,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.026245065964758396,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006594939002146323,
"eval_signal/format_reward/centered_abs_mean": 0.03993055565903584,
"eval_signal/format_reward/group_std_mean": 0.10589349828660488,
"eval_signal/format_reward/group_zero_std_frac": 0.4444444527228673,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.07909915472070377,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.01996527782951792,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.03993055565903584,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.10589349828660488,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.4444444527228673,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.015819831596066553,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.003993055705601971,
"eval_steps_per_second": 0.029,
"step": 100
},
{
"epoch": 0.23999700003749952,
"step": 100,
"train_probe_completions/clipped_ratio": 0.025868055555555585,
"train_probe_completions/max_length": 2853.0,
"train_probe_completions/max_terminated_length": 2853.0,
"train_probe_completions/mean_length": 692.7083638509115,
"train_probe_completions/mean_terminated_length": 711.0718892415365,
"train_probe_completions/min_length": 0.0,
"train_probe_completions/min_terminated_length": 184.5,
"train_probe_loss": 0.0,
"train_probe_num_tokens": 203115215.0,
"train_probe_reward": 0.8864819407463074,
"train_probe_reward_std": 0.25598659614721936,
"train_probe_rewards/accuracy_reward": 0.6631944477558136,
"train_probe_rewards/batch_coverage_0": 0.016140021577787895,
"train_probe_rewards/brier_reward": 0.7635823786258698,
"train_probe_rewards/confidence_uniqueness_reward": 0.8732908368110657,
"train_probe_rewards/format_reward": 0.9739583432674408,
"train_probe_rewards/frontier_entropy_batch_reward": -0.9739583432674408,
"train_probe_runtime": 213.2664,
"train_probe_samples_per_second": 4.689,
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.4340277810891469,
"train_probe_signal/accuracy_reward/group_std_mean": 0.4723463257153829,
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8704935510953268,
"train_probe_signal/accuracy_reward/weight": 0.5,
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.21701389054457346,
"train_probe_signal/advantage_abs_mean": 0.854099969069163,
"train_probe_signal/advantage_pre_scale_abs_mean": 0.2182364066441854,
"train_probe_signal/advantage_pre_scale_std": 0.2556636830170949,
"train_probe_signal/advantage_std": 0.9864280919233958,
"train_probe_signal/batch_coverage_0/centered_abs_mean": 0.2105137606461843,
"train_probe_signal/batch_coverage_0/group_std_mean": 0.30563366661469143,
"train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.08499650408824284,
"train_probe_signal/batch_coverage_0/weight": 0.10000000149011612,
"train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.021051375816265743,
"train_probe_signal/brier_reward/centered_abs_mean": 0.21621683984994888,
"train_probe_signal/brier_reward/group_std_mean": 0.2767111559708913,
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08661133795976639,
"train_probe_signal/brier_reward/weight": 0.10000000149011612,
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.021621684543788433,
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.07127842927972476,
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.12808794404069582,
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.028270625198880833,
"train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007127842943494518,
"train_probe_signal/format_reward/centered_abs_mean": 0.04893663106486201,
"train_probe_signal/format_reward/group_std_mean": 0.11285140831023455,
"train_probe_signal/format_reward/group_zero_std_frac": 0.4722222362955411,
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0958357664446036,
"train_probe_signal/format_reward/weight": 0.5,
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.024468315532431006,
"train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.04893663106486201,
"train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.11285140831023455,
"train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.4722222362955411,
"train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.019167154173677165,
"train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0048936633781219525,
"train_probe_steps_per_second": 0.028
},
{
"calibration/aurc": 0.32709221037996683,
"calibration/batch_distribution_entropy": 0.9072256917534363,
"calibration/batch_entropy_100bins": 0.9236496931399495,
"calibration/batch_entropy_10bins": 0.9072256917534363,
"calibration/batch_entropy_50bins": 0.927115252359181,
"calibration/batch_uniqueness": 0.9367734004126167,
"calibration/buffer_distribution_entropy": 0.9319828898045401,
"calibration/buffer_entropy_100bins": 0.9473722336840288,
"calibration/buffer_entropy_10bins": 0.9319828898045401,
"calibration/buffer_entropy_50bins": 0.9516387033787922,
"calibration/confidence_entropy": 0.4495748465857103,
"calibration/coverage@0%": 0.033092714381499924,
"calibration/coverage@1%": 0.033092714381499924,
"calibration/coverage@10%": 0.15477837853260484,
"calibration/coverage@15%": 0.17701470819007947,
"calibration/coverage@20%": 0.2013024808479678,
"calibration/coverage@25%": 0.23189123197141104,
"calibration/coverage@30%": 0.504432124883315,
"calibration/coverage@5%": 0.12432486411686663,
"calibration/distribution_entropy_10": 0.9072256917534363,
"calibration/distribution_entropy_100": 0.9236496931399495,
"calibration/ece": 0.15953709471533112,
"calibration/mean_confidence": 0.6542690738937268,
"calibration/unique_confidence_per_question": 0.9598958333333332,
"calibration/unique_confidences": 368.6,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.030208333333333327,
"completions/max_length": 3699.2,
"completions/max_terminated_length": 3699.2,
"completions/mean_length": 666.5580078125,
"completions/mean_terminated_length": 687.2699340820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.6,
"epoch": 0.2519968500393745,
"grad_norm": 0.07229447364807129,
"learning_rate": 3.1024096385542172e-06,
"loss": -0.0794,
"num_tokens": 213870827.0,
"reward": 0.9622564077377319,
"reward_std": 0.16706106960773467,
"rewards/accuracy_reward": 0.6428819298744202,
"rewards/batch_coverage_0": 0.26482947170734406,
"rewards/brier_reward": 0.7719120621681214,
"rewards/confidence_uniqueness_reward": 0.9088397026062012,
"rewards/format_reward": 0.9683159708976745,
"rewards/frontier_entropy_batch_reward": -0.3790066301822662,
"signal/accuracy_reward/centered_abs_mean": 0.16609157919883727,
"signal/accuracy_reward/group_std_mean": 0.22139038443565368,
"signal/accuracy_reward/group_zero_std_frac": 0.36666667461395264,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9980636358261108,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08304578959941863,
"signal/advantage_abs_mean": 0.7221752285957337,
"signal/advantage_pre_scale_abs_mean": 0.12088042944669723,
"signal/advantage_pre_scale_std": 0.1985163629055023,
"signal/advantage_std": 0.983276081085205,
"signal/batch_coverage_0/centered_abs_mean": 0.15856701135635376,
"signal/batch_coverage_0/group_std_mean": 0.20437993705272675,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.18988114297389985,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015856701508164407,
"signal/brier_reward/centered_abs_mean": 0.16616459488868712,
"signal/brier_reward/group_std_mean": 0.21281179487705232,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19939744472503662,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016616460122168063,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06404128819704055,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10072248876094818,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07632052302360534,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006404128670692444,
"signal/format_reward/centered_abs_mean": 0.05047200433909893,
"signal/format_reward/group_std_mean": 0.08542852699756623,
"signal/format_reward/group_zero_std_frac": 0.6861111164093018,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.29700973331928254,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.025236002169549465,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3599489688873291,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4289889752864838,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43422353863716123,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03599489852786064,
"step": 105
},
{
"calibration/aurc": 0.1445495184896525,
"calibration/batch_distribution_entropy": 0.9549703359188151,
"calibration/batch_entropy_100bins": 0.9496826389779457,
"calibration/batch_entropy_10bins": 0.9549703359188151,
"calibration/batch_entropy_50bins": 0.9596300438920267,
"calibration/batch_uniqueness": 0.9464544686187354,
"calibration/buffer_distribution_entropy": 0.9319959695872686,
"calibration/buffer_entropy_100bins": 0.9489525242674711,
"calibration/buffer_entropy_10bins": 0.9319959695872686,
"calibration/buffer_entropy_50bins": 0.9523803251403093,
"calibration/confidence_entropy": 0.4915952441003196,
"calibration/coverage@0%": 0.04353477677542926,
"calibration/coverage@1%": 0.04353477677542926,
"calibration/coverage@10%": 0.42479945249786555,
"calibration/coverage@15%": 0.575662717001929,
"calibration/coverage@20%": 0.7581317016685564,
"calibration/coverage@25%": 0.8631562383772495,
"calibration/coverage@30%": 0.9326810288101164,
"calibration/coverage@5%": 0.19128571148926302,
"calibration/distribution_entropy_10": 0.9549703359188151,
"calibration/distribution_entropy_100": 0.9496826389779457,
"calibration/ece": 0.1360942299199772,
"calibration/mean_confidence": 0.5788540314486034,
"calibration/unique_confidence_per_question": 0.984375,
"calibration/unique_confidences": 378.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.020225694444444442,
"completions/max_length": 3580.6,
"completions/max_terminated_length": 3580.6,
"completions/mean_length": 682.2786499023438,
"completions/mean_terminated_length": 696.4890258789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.4,
"epoch": 0.2639967000412495,
"grad_norm": 0.032034844160079956,
"learning_rate": 2.9518072289156627e-06,
"loss": -0.0528,
"num_tokens": 224839125.0,
"reward": 1.006770396232605,
"reward_std": 0.15200525820255278,
"rewards/accuracy_reward": 0.6963541746139527,
"rewards/batch_coverage_0": 0.24029523730278016,
"rewards/brier_reward": 0.7906111955642701,
"rewards/confidence_uniqueness_reward": 0.9291220664978027,
"rewards/format_reward": 0.9788194417953491,
"rewards/frontier_entropy_batch_reward": -0.268192446231842,
"signal/accuracy_reward/centered_abs_mean": 0.1638563334941864,
"signal/accuracy_reward/group_std_mean": 0.22195173501968385,
"signal/accuracy_reward/group_zero_std_frac": 0.35,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8842053294181824,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0819281667470932,
"signal/advantage_abs_mean": 0.7184135675430298,
"signal/advantage_pre_scale_abs_mean": 0.10958259105682373,
"signal/advantage_pre_scale_std": 0.17912587225437165,
"signal/advantage_std": 0.9834012746810913,
"signal/batch_coverage_0/centered_abs_mean": 0.1551068753004074,
"signal/batch_coverage_0/group_std_mean": 0.19941962957382203,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.16745518445968627,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015510687604546547,
"signal/brier_reward/centered_abs_mean": 0.14882863759994508,
"signal/brier_reward/group_std_mean": 0.1928631156682968,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16044945418834686,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.014882863871753216,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.041838298365473744,
"signal/confidence_uniqueness_reward/group_std_mean": 0.06950674280524254,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04490874633193016,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004183829948306084,
"signal/format_reward/centered_abs_mean": 0.03174913227558136,
"signal/format_reward/group_std_mean": 0.057792513817548755,
"signal/format_reward/group_zero_std_frac": 0.7666666746139527,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.16912963092327118,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01587456613779068,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.31741302013397216,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3890170991420746,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.3446439206600189,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03174130246043205,
"step": 110
},
{
"calibration/aurc": 0.24997302512551908,
"calibration/batch_distribution_entropy": 0.948771182454743,
"calibration/batch_entropy_100bins": 0.9441092968976221,
"calibration/batch_entropy_10bins": 0.948771182454743,
"calibration/batch_entropy_50bins": 0.955403885102182,
"calibration/batch_uniqueness": 0.9460239538764702,
"calibration/buffer_distribution_entropy": 0.936164731264576,
"calibration/buffer_entropy_100bins": 0.9522699329038218,
"calibration/buffer_entropy_10bins": 0.936164731264576,
"calibration/buffer_entropy_50bins": 0.9554519538114729,
"calibration/confidence_entropy": 0.4640606387456156,
"calibration/coverage@0%": 0.01047088844822663,
"calibration/coverage@1%": 0.01047088844822663,
"calibration/coverage@10%": 0.09750734469069952,
"calibration/coverage@15%": 0.3214186635337727,
"calibration/coverage@20%": 0.4891239852148862,
"calibration/coverage@25%": 0.5843370164004027,
"calibration/coverage@30%": 0.6630702631818101,
"calibration/coverage@5%": 0.033055108306382655,
"calibration/distribution_entropy_10": 0.948771182454743,
"calibration/distribution_entropy_100": 0.9441092968976221,
"calibration/ece": 0.1190749094862839,
"calibration/mean_confidence": 0.5852567080340979,
"calibration/unique_confidence_per_question": 0.9880208333333332,
"calibration/unique_confidences": 379.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02152777777777779,
"completions/max_length": 3657.2,
"completions/max_terminated_length": 3657.2,
"completions/mean_length": 685.8245727539063,
"completions/mean_terminated_length": 701.0185913085937,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.6,
"epoch": 0.27599655004312446,
"grad_norm": 0.01640477403998375,
"learning_rate": 2.8012048192771087e-06,
"loss": -0.0579,
"num_tokens": 235819024.0,
"reward": 0.9847165465354919,
"reward_std": 0.15014655888080597,
"rewards/accuracy_reward": 0.6589409708976746,
"rewards/batch_coverage_0": 0.27372472286224364,
"rewards/brier_reward": 0.7927528858184815,
"rewards/confidence_uniqueness_reward": 0.9244337797164917,
"rewards/format_reward": 0.9782986044883728,
"rewards/frontier_entropy_batch_reward": -0.3299441754817963,
"signal/accuracy_reward/centered_abs_mean": 0.16461046040058136,
"signal/accuracy_reward/group_std_mean": 0.21322064697742463,
"signal/accuracy_reward/group_zero_std_frac": 0.4,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0106230854988099,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08230523020029068,
"signal/advantage_abs_mean": 0.7277908205986023,
"signal/advantage_pre_scale_abs_mean": 0.11005394756793976,
"signal/advantage_pre_scale_std": 0.17997059226036072,
"signal/advantage_std": 0.9832545518875122,
"signal/batch_coverage_0/centered_abs_mean": 0.16282705068588257,
"signal/batch_coverage_0/group_std_mean": 0.2089460790157318,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.19948122501373292,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0162827055901289,
"signal/brier_reward/centered_abs_mean": 0.15371213257312774,
"signal/brier_reward/group_std_mean": 0.197787806391716,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18880485296249389,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.015371213294565677,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.046393326669931415,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07716395705938339,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05672838613390922,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004639332601800561,
"signal/format_reward/centered_abs_mean": 0.03493923582136631,
"signal/format_reward/group_std_mean": 0.06415542513132096,
"signal/format_reward/group_zero_std_frac": 0.7416666507720947,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.21280283629894256,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.017469617910683154,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3413366138935089,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4106238782405853,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41952593326568605,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034133663028478624,
"step": 115
},
{
"calibration/aurc": 0.27797711120421953,
"calibration/batch_distribution_entropy": 0.9583379741927558,
"calibration/batch_entropy_100bins": 0.9493557397434813,
"calibration/batch_entropy_10bins": 0.9583379741927558,
"calibration/batch_entropy_50bins": 0.9613567988695655,
"calibration/batch_uniqueness": 0.9479972252470749,
"calibration/buffer_distribution_entropy": 0.9377245142123376,
"calibration/buffer_entropy_100bins": 0.9542313286000972,
"calibration/buffer_entropy_10bins": 0.9377245142123376,
"calibration/buffer_entropy_50bins": 0.9569445223722793,
"calibration/confidence_entropy": 0.45816828200534754,
"calibration/coverage@0%": 0.025490337368947603,
"calibration/coverage@1%": 0.025490337368947603,
"calibration/coverage@10%": 0.20452225123288423,
"calibration/coverage@15%": 0.2787226282083809,
"calibration/coverage@20%": 0.5025003751481274,
"calibration/coverage@25%": 0.5424779528109738,
"calibration/coverage@30%": 0.5757674726677722,
"calibration/coverage@5%": 0.07545553963080058,
"calibration/distribution_entropy_10": 0.9583379741927558,
"calibration/distribution_entropy_100": 0.9493557397434813,
"calibration/ece": 0.14685373933054077,
"calibration/mean_confidence": 0.5651412098927935,
"calibration/unique_confidence_per_question": 0.9776041666666666,
"calibration/unique_confidences": 375.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01961805555555556,
"completions/max_length": 3235.8,
"completions/max_terminated_length": 3235.8,
"completions/mean_length": 642.6586059570312,
"completions/mean_terminated_length": 655.4541381835937,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.8,
"epoch": 0.28799640004499943,
"grad_norm": 0.015719039365649223,
"learning_rate": 2.6506024096385547e-06,
"loss": -0.055,
"num_tokens": 246304307.0,
"reward": 0.9861522912979126,
"reward_std": 0.14733969867229463,
"rewards/accuracy_reward": 0.6541666746139526,
"rewards/batch_coverage_0": 0.28446308374404905,
"rewards/brier_reward": 0.7799728155136109,
"rewards/confidence_uniqueness_reward": 0.927653694152832,
"rewards/format_reward": 0.9799479246139526,
"rewards/frontier_entropy_batch_reward": -0.30113983154296875,
"signal/accuracy_reward/centered_abs_mean": 0.16872830241918563,
"signal/accuracy_reward/group_std_mean": 0.21660387516021729,
"signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0460703372955322,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08436415120959281,
"signal/advantage_abs_mean": 0.7369883894920349,
"signal/advantage_pre_scale_abs_mean": 0.10920974463224412,
"signal/advantage_pre_scale_std": 0.1773868203163147,
"signal/advantage_std": 0.9832382082939148,
"signal/batch_coverage_0/centered_abs_mean": 0.17697641849517823,
"signal/batch_coverage_0/group_std_mean": 0.22625457048416137,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2210148811340332,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017697642371058465,
"signal/brier_reward/centered_abs_mean": 0.15970203280448914,
"signal/brier_reward/group_std_mean": 0.20490642786026,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19861160218715668,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.015970203652977945,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04404056295752525,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07342168614268303,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.055048702657222746,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004404056537896395,
"signal/format_reward/centered_abs_mean": 0.0325575090944767,
"signal/format_reward/group_std_mean": 0.06021936684846878,
"signal/format_reward/group_zero_std_frac": 0.7555555701255798,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.20300004184246062,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01627875454723835,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3320270121097565,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4021295964717865,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4149349987506866,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03320269919931888,
"step": 120
},
{
"calibration/aurc": 0.1703143158395383,
"calibration/batch_distribution_entropy": 0.8794585073573937,
"calibration/batch_entropy_100bins": 0.9106494560303057,
"calibration/batch_entropy_10bins": 0.8794585073573937,
"calibration/batch_entropy_50bins": 0.9141756705557457,
"calibration/batch_uniqueness": 0.9305113696453565,
"calibration/buffer_distribution_entropy": 0.9404959265968345,
"calibration/buffer_entropy_100bins": 0.9565428692936477,
"calibration/buffer_entropy_10bins": 0.9404959265968345,
"calibration/buffer_entropy_50bins": 0.9589872588913677,
"calibration/confidence_entropy": 0.4253087345674131,
"calibration/coverage@0%": 0.09905896770438485,
"calibration/coverage@1%": 0.15895480103771814,
"calibration/coverage@10%": 0.4012652997253823,
"calibration/coverage@15%": 0.4946182213996466,
"calibration/coverage@20%": 0.5521577999911764,
"calibration/coverage@25%": 0.7789902226812445,
"calibration/coverage@30%": 0.871652319279875,
"calibration/coverage@5%": 0.3014460083867995,
"calibration/distribution_entropy_10": 0.8794585073573937,
"calibration/distribution_entropy_100": 0.9106494560303057,
"calibration/ece": 0.16335647487649493,
"calibration/mean_confidence": 0.6267505431558249,
"calibration/unique_confidence_per_question": 0.990625,
"calibration/unique_confidences": 380.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.011631944444444464,
"completions/max_length": 3299.4,
"completions/max_terminated_length": 3299.4,
"completions/mean_length": 625.4529663085938,
"completions/mean_terminated_length": 632.8467529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.4,
"epoch": 0.2999962500468744,
"grad_norm": 0.016347240656614304,
"learning_rate": 2.5e-06,
"loss": -0.0321,
"num_tokens": 256627189.0,
"reward": 1.0023396015167236,
"reward_std": 0.1366099238395691,
"rewards/accuracy_reward": 0.6794270873069763,
"rewards/batch_coverage_0": 0.3166307270526886,
"rewards/brier_reward": 0.8103621363639831,
"rewards/confidence_uniqueness_reward": 0.9277669072151185,
"rewards/format_reward": 0.9872395753860473,
"rewards/frontier_entropy_batch_reward": -0.3646974146366119,
"signal/accuracy_reward/centered_abs_mean": 0.16190863847732545,
"signal/accuracy_reward/group_std_mean": 0.21255030035972594,
"signal/accuracy_reward/group_zero_std_frac": 0.4083333373069763,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.1290634512901305,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08095431923866273,
"signal/advantage_abs_mean": 0.7284621834754944,
"signal/advantage_pre_scale_abs_mean": 0.09951311200857163,
"signal/advantage_pre_scale_std": 0.16377390921115875,
"signal/advantage_std": 0.9830830574035645,
"signal/batch_coverage_0/centered_abs_mean": 0.16607835292816162,
"signal/batch_coverage_0/group_std_mean": 0.21322621107101442,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23173038959503173,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.016607835702598096,
"signal/brier_reward/centered_abs_mean": 0.14223978221416472,
"signal/brier_reward/group_std_mean": 0.1856682389974594,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19865721762180327,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01422397829592228,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.036846403032541275,
"signal/confidence_uniqueness_reward/group_std_mean": 0.06017006188631058,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05238675698637962,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0036846403498202564,
"signal/format_reward/centered_abs_mean": 0.02101236991584301,
"signal/format_reward/group_std_mean": 0.041347318515181544,
"signal/format_reward/group_zero_std_frac": 0.8222222328186035,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.14590775668621064,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.010506184957921504,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33323261737823484,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40458549857139586,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47181462645530703,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03332326300442219,
"step": 125
},
{
"calibration/aurc": 0.2300665525636191,
"calibration/batch_distribution_entropy": 0.9289155334598537,
"calibration/batch_entropy_100bins": 0.9339730823409967,
"calibration/batch_entropy_10bins": 0.9289155334598537,
"calibration/batch_entropy_50bins": 0.9422619695279744,
"calibration/batch_uniqueness": 0.9426163443601956,
"calibration/buffer_distribution_entropy": 0.9394723763941414,
"calibration/buffer_entropy_100bins": 0.9569623231573903,
"calibration/buffer_entropy_10bins": 0.9394723763941414,
"calibration/buffer_entropy_50bins": 0.9588223705223482,
"calibration/confidence_entropy": 0.44351004871113436,
"calibration/coverage@0%": 0.0154343832961903,
"calibration/coverage@1%": 0.0154343832961903,
"calibration/coverage@10%": 0.20882918884792864,
"calibration/coverage@15%": 0.3081636308958342,
"calibration/coverage@20%": 0.38748598522643307,
"calibration/coverage@25%": 0.5834554325420772,
"calibration/coverage@30%": 0.8136605973157437,
"calibration/coverage@5%": 0.1259290743318735,
"calibration/distribution_entropy_10": 0.9289155334598537,
"calibration/distribution_entropy_100": 0.9339730823409967,
"calibration/ece": 0.14460488876040145,
"calibration/mean_confidence": 0.5764310270988868,
"calibration/unique_confidence_per_question": 0.9838541666666668,
"calibration/unique_confidences": 377.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008854166666666653,
"completions/max_length": 2783.2,
"completions/max_terminated_length": 2783.2,
"completions/mean_length": 621.380224609375,
"completions/mean_terminated_length": 627.0984619140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.4,
"epoch": 0.3119961000487494,
"grad_norm": 0.006308301351964474,
"learning_rate": 2.349397590361446e-06,
"loss": -0.0272,
"num_tokens": 266910289.0,
"reward": 0.9967981100082397,
"reward_std": 0.13324756622314454,
"rewards/accuracy_reward": 0.6637152791023254,
"rewards/batch_coverage_0": 0.2850965529680252,
"rewards/brier_reward": 0.8013586163520813,
"rewards/confidence_uniqueness_reward": 0.9356141209602356,
"rewards/format_reward": 0.9899305462837219,
"rewards/frontier_entropy_batch_reward": -0.32231796681880953,
"signal/accuracy_reward/centered_abs_mean": 0.1640516459941864,
"signal/accuracy_reward/group_std_mean": 0.21628546714782715,
"signal/accuracy_reward/group_zero_std_frac": 0.38055555820465087,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0400630950927734,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0820258229970932,
"signal/advantage_abs_mean": 0.7400351524353027,
"signal/advantage_pre_scale_abs_mean": 0.09946749657392502,
"signal/advantage_pre_scale_std": 0.1557171642780304,
"signal/advantage_std": 0.9832014203071594,
"signal/batch_coverage_0/centered_abs_mean": 0.15798864066600798,
"signal/batch_coverage_0/group_std_mean": 0.20295707881450653,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20411037504673005,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01579886469990015,
"signal/brier_reward/centered_abs_mean": 0.1348998188972473,
"signal/brier_reward/group_std_mean": 0.1756774067878723,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1731540232896805,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01348998285830021,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.031015625223517418,
"signal/confidence_uniqueness_reward/group_std_mean": 0.04782265685498714,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04015704616904259,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003101562592200935,
"signal/format_reward/centered_abs_mean": 0.016655815846752374,
"signal/format_reward/group_std_mean": 0.03051824141293764,
"signal/format_reward/group_zero_std_frac": 0.8777777791023255,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.10784837445244193,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.008327907923376187,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33705700635910035,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4063593685626984,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43399363160133364,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03370570130646229,
"step": 130
},
{
"calibration/aurc": 0.2010499629419042,
"calibration/batch_distribution_entropy": 0.9453438710641814,
"calibration/batch_entropy_100bins": 0.9417344096389343,
"calibration/batch_entropy_10bins": 0.9453438710641814,
"calibration/batch_entropy_50bins": 0.9516687021452327,
"calibration/batch_uniqueness": 0.9439579691190396,
"calibration/buffer_distribution_entropy": 0.9419838075350185,
"calibration/buffer_entropy_100bins": 0.9589754047081909,
"calibration/buffer_entropy_10bins": 0.9419838075350185,
"calibration/buffer_entropy_50bins": 0.96063677512858,
"calibration/confidence_entropy": 0.46578609429290746,
"calibration/coverage@0%": 0.20367177080733212,
"calibration/coverage@1%": 0.21411563503709713,
"calibration/coverage@10%": 0.3944984289031729,
"calibration/coverage@15%": 0.4422044108251454,
"calibration/coverage@20%": 0.5067000551427593,
"calibration/coverage@25%": 0.5444308952584296,
"calibration/coverage@30%": 0.6695770806183983,
"calibration/coverage@5%": 0.32690936871855925,
"calibration/distribution_entropy_10": 0.9453438710641814,
"calibration/distribution_entropy_100": 0.9417344096389343,
"calibration/ece": 0.179603961910554,
"calibration/mean_confidence": 0.5899262525496647,
"calibration/unique_confidence_per_question": 0.9963541666666668,
"calibration/unique_confidences": 382.6,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006076388888888862,
"completions/max_length": 3393.2,
"completions/max_terminated_length": 3393.2,
"completions/mean_length": 594.621435546875,
"completions/mean_terminated_length": 598.2032958984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.32399595005062437,
"grad_norm": 0.010317265056073666,
"learning_rate": 2.1987951807228917e-06,
"loss": -0.0126,
"num_tokens": 276853352.0,
"reward": 1.0097585678100587,
"reward_std": 0.12289493680000305,
"rewards/accuracy_reward": 0.6751736044883728,
"rewards/batch_coverage_0": 0.2959615230560303,
"rewards/brier_reward": 0.8135364770889282,
"rewards/confidence_uniqueness_reward": 0.9416105151176453,
"rewards/format_reward": 0.99375,
"rewards/frontier_entropy_batch_reward": -0.2981413632631302,
"signal/accuracy_reward/centered_abs_mean": 0.15569661557674408,
"signal/accuracy_reward/group_std_mean": 0.20491576492786406,
"signal/accuracy_reward/group_zero_std_frac": 0.4138888895511627,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0529741644859314,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07784830778837204,
"signal/advantage_abs_mean": 0.7428248643875122,
"signal/advantage_pre_scale_abs_mean": 0.09276586472988128,
"signal/advantage_pre_scale_std": 0.1466542512178421,
"signal/advantage_std": 0.9831327557563782,
"signal/batch_coverage_0/centered_abs_mean": 0.15543493628501892,
"signal/batch_coverage_0/group_std_mean": 0.19824867248535155,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21039265990257264,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015543493255972863,
"signal/brier_reward/centered_abs_mean": 0.12734594643115998,
"signal/brier_reward/group_std_mean": 0.1648596316576004,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17233424186706542,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012734594009816646,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02444492354989052,
"signal/confidence_uniqueness_reward/group_std_mean": 0.037316303700208664,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03319260329008102,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0024444923736155035,
"signal/format_reward/centered_abs_mean": 0.01032986119389534,
"signal/format_reward/group_std_mean": 0.020031702518463135,
"signal/format_reward/group_zero_std_frac": 0.9138889074325561,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.07024011090397835,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00516493059694767,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3326454758644104,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40271806716918945,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4504189193248749,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03326454721391201,
"step": 135
},
{
"calibration/aurc": 0.11405225671003227,
"calibration/batch_distribution_entropy": 0.9285432797078601,
"calibration/batch_entropy_100bins": 0.9358449348977624,
"calibration/batch_entropy_10bins": 0.9285432797078601,
"calibration/batch_entropy_50bins": 0.9439536592451411,
"calibration/batch_uniqueness": 0.9417474476857131,
"calibration/buffer_distribution_entropy": 0.9481680561778119,
"calibration/buffer_entropy_100bins": 0.9653146440201216,
"calibration/buffer_entropy_10bins": 0.9481680561778119,
"calibration/buffer_entropy_50bins": 0.9656940039647847,
"calibration/confidence_entropy": 0.4419780302764675,
"calibration/coverage@0%": 0.022544080303125238,
"calibration/coverage@1%": 0.07241284670732472,
"calibration/coverage@10%": 0.610283779597179,
"calibration/coverage@15%": 0.7543212120658326,
"calibration/coverage@20%": 0.8502777944575561,
"calibration/coverage@25%": 0.9230536656641395,
"calibration/coverage@30%": 0.9827837926509186,
"calibration/coverage@5%": 0.31054481087742014,
"calibration/distribution_entropy_10": 0.9285432797078601,
"calibration/distribution_entropy_100": 0.9358449348977624,
"calibration/ece": 0.12129232013378737,
"calibration/mean_confidence": 0.6215515878447262,
"calibration/unique_confidence_per_question": 0.9921875,
"calibration/unique_confidences": 381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00972222222222221,
"completions/max_length": 3051.4,
"completions/max_terminated_length": 3051.4,
"completions/mean_length": 600.0604248046875,
"completions/mean_terminated_length": 605.9631225585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.33599580005249935,
"grad_norm": 0.004826192278414965,
"learning_rate": 2.0481927710843377e-06,
"loss": -0.0332,
"num_tokens": 286870272.0,
"reward": 1.0001838564872743,
"reward_std": 0.1291646659374237,
"rewards/accuracy_reward": 0.6635416626930237,
"rewards/batch_coverage_0": 0.3250649034976959,
"rewards/brier_reward": 0.8216192960739136,
"rewards/confidence_uniqueness_reward": 0.9341175317764282,
"rewards/format_reward": 0.9896701335906982,
"rewards/frontier_entropy_batch_reward": -0.3450227200984955,
"signal/accuracy_reward/centered_abs_mean": 0.14829643964767455,
"signal/accuracy_reward/group_std_mean": 0.1988508701324463,
"signal/accuracy_reward/group_zero_std_frac": 0.42222222685813904,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0239773392677307,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07414821982383728,
"signal/advantage_abs_mean": 0.7267651915550232,
"signal/advantage_pre_scale_abs_mean": 0.09410272091627121,
"signal/advantage_pre_scale_std": 0.1546729475259781,
"signal/advantage_std": 0.9830945611000061,
"signal/batch_coverage_0/centered_abs_mean": 0.15927667319774627,
"signal/batch_coverage_0/group_std_mean": 0.20444611310958863,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22108587622642517,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015927667170763014,
"signal/brier_reward/centered_abs_mean": 0.13157424628734588,
"signal/brier_reward/group_std_mean": 0.17232659459114075,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18126892745494844,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01315742488950491,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03295594230294228,
"signal/confidence_uniqueness_reward/group_std_mean": 0.055471654236316684,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.044615660607814786,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003295594220981002,
"signal/format_reward/centered_abs_mean": 0.018386501539498567,
"signal/format_reward/group_std_mean": 0.03843978680670261,
"signal/format_reward/group_zero_std_frac": 0.8277777910232544,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.11741685792803765,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.009193250769749283,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3423487484455109,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4122697412967682,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4785375475883484,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034234875440597536,
"step": 140
},
{
"calibration/aurc": 0.17159364528919666,
"calibration/batch_distribution_entropy": 0.9739341873668828,
"calibration/batch_entropy_100bins": 0.9591711536611062,
"calibration/batch_entropy_10bins": 0.9739341873668828,
"calibration/batch_entropy_50bins": 0.9695936724147591,
"calibration/batch_uniqueness": 0.9512091916551567,
"calibration/buffer_distribution_entropy": 0.9587097224168127,
"calibration/buffer_entropy_100bins": 0.9746741480605607,
"calibration/buffer_entropy_10bins": 0.9587097224168127,
"calibration/buffer_entropy_50bins": 0.9736304805691599,
"calibration/confidence_entropy": 0.46751717394040365,
"calibration/coverage@0%": 0.026225070786173317,
"calibration/coverage@1%": 0.026225070786173317,
"calibration/coverage@10%": 0.32045087218771207,
"calibration/coverage@15%": 0.5025195465554969,
"calibration/coverage@20%": 0.6570699821959377,
"calibration/coverage@25%": 0.7701132983193975,
"calibration/coverage@30%": 0.875789496312478,
"calibration/coverage@5%": 0.09018677412997969,
"calibration/distribution_entropy_10": 0.9739341873668828,
"calibration/distribution_entropy_100": 0.9591711536611062,
"calibration/ece": 0.15216885313654238,
"calibration/mean_confidence": 0.5343308437468959,
"calibration/unique_confidence_per_question": 0.9942708333333334,
"calibration/unique_confidences": 381.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009722222222222188,
"completions/max_length": 3167.2,
"completions/max_terminated_length": 3167.2,
"completions/mean_length": 608.954248046875,
"completions/mean_terminated_length": 614.9598876953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.34799565005437433,
"grad_norm": 0.0030671849381178617,
"learning_rate": 1.8975903614457832e-06,
"loss": -0.0267,
"num_tokens": 296950033.0,
"reward": 1.0304441452026367,
"reward_std": 0.1264051616191864,
"rewards/accuracy_reward": 0.7199652671813965,
"rewards/batch_coverage_0": 0.29423797130584717,
"rewards/brier_reward": 0.8002641916275024,
"rewards/confidence_uniqueness_reward": 0.9402836084365844,
"rewards/format_reward": 0.9901041507720947,
"rewards/frontier_entropy_batch_reward": -0.28069123029708865,
"signal/accuracy_reward/centered_abs_mean": 0.1349392354488373,
"signal/accuracy_reward/group_std_mean": 0.19156003892421722,
"signal/accuracy_reward/group_zero_std_frac": 0.4055555582046509,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8481933355331421,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06746961772441865,
"signal/advantage_abs_mean": 0.7143575072288513,
"signal/advantage_pre_scale_abs_mean": 0.08871349543333054,
"signal/advantage_pre_scale_std": 0.14817543923854828,
"signal/advantage_std": 0.9832207202911377,
"signal/batch_coverage_0/centered_abs_mean": 0.15694086849689484,
"signal/batch_coverage_0/group_std_mean": 0.1989602953195572,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.19811130166053773,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015694086998701097,
"signal/brier_reward/centered_abs_mean": 0.13146734237670898,
"signal/brier_reward/group_std_mean": 0.173081836104393,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16585685014724733,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013146734982728957,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.028389297798275946,
"signal/confidence_uniqueness_reward/group_std_mean": 0.047778960317373276,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.035612285137176514,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002838929882273078,
"signal/format_reward/centered_abs_mean": 0.016731770522892476,
"signal/format_reward/group_std_mean": 0.033717308193445206,
"signal/format_reward/group_zero_std_frac": 0.8527777791023254,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.104011070728302,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.008365885261446238,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.329537171125412,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39947680830955506,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41544281244277953,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.032953718304634096,
"step": 145
},
{
"calibration/aurc": 0.17155475015271432,
"calibration/batch_distribution_entropy": 0.9508950837242326,
"calibration/batch_entropy_100bins": 0.9499402094362974,
"calibration/batch_entropy_10bins": 0.9508950837242326,
"calibration/batch_entropy_50bins": 0.9593933804461509,
"calibration/batch_uniqueness": 0.9471473333291687,
"calibration/buffer_distribution_entropy": 0.9677614387212963,
"calibration/buffer_entropy_100bins": 0.9817337736389484,
"calibration/buffer_entropy_10bins": 0.9677614387212963,
"calibration/buffer_entropy_50bins": 0.9798977340998449,
"calibration/confidence_entropy": 0.445313271491532,
"calibration/coverage@0%": 0.07768557054417162,
"calibration/coverage@1%": 0.1580005311740929,
"calibration/coverage@10%": 0.429632352443371,
"calibration/coverage@15%": 0.5252005253547066,
"calibration/coverage@20%": 0.5991877003084655,
"calibration/coverage@25%": 0.7273865506431193,
"calibration/coverage@30%": 0.7858148091855133,
"calibration/coverage@5%": 0.3040409427787214,
"calibration/distribution_entropy_10": 0.9508950837242326,
"calibration/distribution_entropy_100": 0.9499402094362974,
"calibration/ece": 0.16646348737058445,
"calibration/mean_confidence": 0.531541467788136,
"calibration/unique_confidence_per_question": 0.9880208333333333,
"calibration/unique_confidences": 379.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010763888888888884,
"completions/max_length": 3309.8,
"completions/max_terminated_length": 3309.8,
"completions/mean_length": 682.9143310546875,
"completions/mean_terminated_length": 690.3553955078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.6,
"epoch": 0.3599955000562493,
"grad_norm": 0.004152393434196711,
"learning_rate": 1.7469879518072292e-06,
"loss": -0.0342,
"num_tokens": 307927542.0,
"reward": 1.0145195960998534,
"reward_std": 0.12927872389554979,
"rewards/accuracy_reward": 0.6892361044883728,
"rewards/batch_coverage_0": 0.3265324205160141,
"rewards/brier_reward": 0.8156694173812866,
"rewards/confidence_uniqueness_reward": 0.9351176023483276,
"rewards/format_reward": 0.9892361164093018,
"rewards/frontier_entropy_batch_reward": -0.32448467910289763,
"signal/accuracy_reward/centered_abs_mean": 0.1487847238779068,
"signal/accuracy_reward/group_std_mean": 0.19472098350524902,
"signal/accuracy_reward/group_zero_std_frac": 0.4472222328186035,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0139795064926147,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0743923619389534,
"signal/advantage_abs_mean": 0.731607973575592,
"signal/advantage_pre_scale_abs_mean": 0.09385995417833329,
"signal/advantage_pre_scale_std": 0.1545901983976364,
"signal/advantage_std": 0.9830900192260742,
"signal/batch_coverage_0/centered_abs_mean": 0.166290482878685,
"signal/batch_coverage_0/group_std_mean": 0.21268681883811952,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23350246846675873,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.016629048623144627,
"signal/brier_reward/centered_abs_mean": 0.13412976413965225,
"signal/brier_reward/group_std_mean": 0.17652139365673064,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18553486466407776,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013412976451218129,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03293977566063404,
"signal/confidence_uniqueness_reward/group_std_mean": 0.05978764072060585,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04623020067811012,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003293977631255984,
"signal/format_reward/centered_abs_mean": 0.01946614570915699,
"signal/format_reward/group_std_mean": 0.044223295897245406,
"signal/format_reward/group_zero_std_frac": 0.7888888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.13444773107767105,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.009733072854578495,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33882314562797544,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40830708146095274,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47709122896194456,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03388231247663498,
"step": 150
},
{
"epoch": 0.3599955000562493,
"eval_completions/clipped_ratio": 0.020833333333333353,
"eval_completions/max_length": 2312.6666666666665,
"eval_completions/max_terminated_length": 2312.6666666666665,
"eval_completions/mean_length": 630.3422037760416,
"eval_completions/mean_terminated_length": 643.8012288411459,
"eval_completions/min_length": 44.666666666666664,
"eval_completions/min_terminated_length": 222.83333333333334,
"eval_loss": 0.0,
"eval_num_tokens": 307927542.0,
"eval_reward": 0.9089752435684204,
"eval_reward_std": 0.24734783669312796,
"eval_rewards/accuracy_reward": 0.6935763955116272,
"eval_rewards/batch_coverage_0": 0.027789496661474306,
"eval_rewards/brier_reward": 0.8029025296370188,
"eval_rewards/confidence_uniqueness_reward": 0.874511311451594,
"eval_rewards/format_reward": 0.9791666766007742,
"eval_rewards/frontier_entropy_batch_reward": -0.9791666766007742,
"eval_runtime": 187.017,
"eval_samples_per_second": 5.347,
"eval_signal/accuracy_reward/centered_abs_mean": 0.41259765625,
"eval_signal/accuracy_reward/group_std_mean": 0.4600823372602463,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8547090888023376,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.206298828125,
"eval_signal/advantage_abs_mean": 0.8410639564196268,
"eval_signal/advantage_pre_scale_abs_mean": 0.20800520231326422,
"eval_signal/advantage_pre_scale_std": 0.2465300957361857,
"eval_signal/advantage_std": 0.9864153365294138,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.18049288044373193,
"eval_signal/batch_coverage_0/group_std_mean": 0.2696962629755338,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07506928406655788,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.018049288696298998,
"eval_signal/brier_reward/centered_abs_mean": 0.200759785870711,
"eval_signal/brier_reward/group_std_mean": 0.2616179163257281,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0827964333196481,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.020075978711247444,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.06461226319273312,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.1164653729647398,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02636661970367034,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006461226458971699,
"eval_signal/format_reward/centered_abs_mean": 0.039605035446584225,
"eval_signal/format_reward/group_std_mean": 0.09938833986719449,
"eval_signal/format_reward/group_zero_std_frac": 0.5000000049670538,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.07894899075229962,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.019802517723292112,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.039605035446584225,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.09938833986719449,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.5000000049670538,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.01578979877134164,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.00396050369211783,
"eval_steps_per_second": 0.032,
"step": 150
},
{
"epoch": 0.3599955000562493,
"step": 150,
"train_probe_completions/clipped_ratio": 0.011284722222222229,
"train_probe_completions/max_length": 2123.1666666666665,
"train_probe_completions/max_terminated_length": 2123.1666666666665,
"train_probe_completions/mean_length": 653.1609598795573,
"train_probe_completions/mean_terminated_length": 660.668711344401,
"train_probe_completions/min_length": 35.833333333333336,
"train_probe_completions/min_terminated_length": 181.83333333333334,
"train_probe_loss": 0.0,
"train_probe_num_tokens": 307927542.0,
"train_probe_reward": 0.9152792493502299,
"train_probe_reward_std": 0.23713775972525278,
"train_probe_rewards/accuracy_reward": 0.7005208333333334,
"train_probe_rewards/batch_coverage_0": 0.02929902666558822,
"train_probe_rewards/brier_reward": 0.8055415451526642,
"train_probe_rewards/confidence_uniqueness_reward": 0.8778475622336069,
"train_probe_rewards/format_reward": 0.984375,
"train_probe_rewards/frontier_entropy_batch_reward": -0.984375,
"train_probe_runtime": 204.3154,
"train_probe_samples_per_second": 4.894,
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.4099934895833333,
"train_probe_signal/accuracy_reward/group_std_mean": 0.45888521273930866,
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8822846511999766,
"train_probe_signal/accuracy_reward/weight": 0.5,
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.20499674479166666,
"train_probe_signal/advantage_abs_mean": 0.8449337383111318,
"train_probe_signal/advantage_pre_scale_abs_mean": 0.20016596963008246,
"train_probe_signal/advantage_pre_scale_std": 0.23646700382232666,
"train_probe_signal/advantage_std": 0.9863986372947693,
"train_probe_signal/batch_coverage_0/centered_abs_mean": 0.18984342366456985,
"train_probe_signal/batch_coverage_0/group_std_mean": 0.27815695852041245,
"train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.08140804183979829,
"train_probe_signal/batch_coverage_0/weight": 0.10000000149011612,
"train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.018984342149148386,
"train_probe_signal/brier_reward/centered_abs_mean": 0.1932607168952624,
"train_probe_signal/brier_reward/group_std_mean": 0.25661252935727435,
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08301879713932674,
"train_probe_signal/brier_reward/weight": 0.10000000149011612,
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.01932607094446818,
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.06211579963564873,
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.10437597334384918,
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.026644666989644367,
"train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006211580087741216,
"train_probe_signal/format_reward/centered_abs_mean": 0.02983940951526165,
"train_probe_signal/format_reward/group_std_mean": 0.07643071375787258,
"train_probe_signal/format_reward/group_zero_std_frac": 0.6111111293236414,
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.06301099962244432,
"train_probe_signal/format_reward/weight": 0.5,
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.014919704757630825,
"train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.02983940951526165,
"train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.07643071375787258,
"train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.6111111293236414,
"train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.012602200809245309,
"train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0029839409980922937,
"train_probe_steps_per_second": 0.029
},
{
"calibration/aurc": 0.13047708267607877,
"calibration/batch_distribution_entropy": 0.9780475321298369,
"calibration/batch_entropy_100bins": 0.9625704525202989,
"calibration/batch_entropy_10bins": 0.9780475321298369,
"calibration/batch_entropy_50bins": 0.9725882234384929,
"calibration/batch_uniqueness": 0.9517862803241908,
"calibration/buffer_distribution_entropy": 0.9735897148741166,
"calibration/buffer_entropy_100bins": 0.9859773805939941,
"calibration/buffer_entropy_10bins": 0.9735897148741166,
"calibration/buffer_entropy_50bins": 0.9837481165719518,
"calibration/confidence_entropy": 0.48740500431683287,
"calibration/coverage@0%": 0.04358208749052296,
"calibration/coverage@1%": 0.09358208749052296,
"calibration/coverage@10%": 0.4876644552477252,
"calibration/coverage@15%": 0.5870458527081308,
"calibration/coverage@20%": 0.7345223454503469,
"calibration/coverage@25%": 0.8802459049707746,
"calibration/coverage@30%": 0.9559009361150557,
"calibration/coverage@5%": 0.3262937880016433,
"calibration/distribution_entropy_10": 0.9780475321298369,
"calibration/distribution_entropy_100": 0.9625704525202989,
"calibration/ece": 0.21096636475852878,
"calibration/mean_confidence": 0.5727649671475417,
"calibration/unique_confidence_per_question": 0.9890625,
"calibration/unique_confidences": 379.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.019357638888888883,
"completions/max_length": 3017.6,
"completions/max_terminated_length": 3017.6,
"completions/mean_length": 637.5643188476563,
"completions/mean_terminated_length": 650.1926635742187,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.6,
"epoch": 0.3719953500581243,
"grad_norm": 0.0035331202670931816,
"learning_rate": 1.5963855421686747e-06,
"loss": -0.0546,
"num_tokens": 318379995.0,
"reward": 1.0356850147247314,
"reward_std": 0.14273865818977355,
"rewards/accuracy_reward": 0.7408854126930237,
"rewards/batch_coverage_0": 0.3165908634662628,
"rewards/brier_reward": 0.8121230244636536,
"rewards/confidence_uniqueness_reward": 0.9271683931350708,
"rewards/format_reward": 0.9801215171813965,
"rewards/frontier_entropy_batch_reward": -0.3040671467781067,
"signal/accuracy_reward/centered_abs_mean": 0.15713433027267457,
"signal/accuracy_reward/group_std_mean": 0.2059522569179535,
"signal/accuracy_reward/group_zero_std_frac": 0.42222222685813904,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0711824655532838,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07856716513633728,
"signal/advantage_abs_mean": 0.7309226036071778,
"signal/advantage_pre_scale_abs_mean": 0.10310240536928177,
"signal/advantage_pre_scale_std": 0.17670319974422455,
"signal/advantage_std": 0.9831022620201111,
"signal/batch_coverage_0/centered_abs_mean": 0.16185049712657928,
"signal/batch_coverage_0/group_std_mean": 0.20479914247989656,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2245582729578018,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.016185050085186958,
"signal/brier_reward/centered_abs_mean": 0.13712520897388458,
"signal/brier_reward/group_std_mean": 0.17701683342456817,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18834921419620515,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013712521269917488,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04509064331650734,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07520343959331513,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06165208369493484,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0045090644154697655,
"signal/format_reward/centered_abs_mean": 0.03320855014026165,
"signal/format_reward/group_std_mean": 0.061374055594205855,
"signal/format_reward/group_zero_std_frac": 0.7527777910232544,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.22536550164222718,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.016604275070130826,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33104270696640015,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39966055750846863,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4580074489116669,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033104270696640015,
"step": 155
},
{
"calibration/aurc": 0.0984045730686661,
"calibration/batch_distribution_entropy": 0.8756279425815006,
"calibration/batch_entropy_100bins": 0.9106554886391262,
"calibration/batch_entropy_10bins": 0.8756279425815006,
"calibration/batch_entropy_50bins": 0.9137960166292582,
"calibration/batch_uniqueness": 0.9327047283052717,
"calibration/buffer_distribution_entropy": 0.9735163551188186,
"calibration/buffer_entropy_100bins": 0.9861154699025338,
"calibration/buffer_entropy_10bins": 0.9735163551188186,
"calibration/buffer_entropy_50bins": 0.9837800172609528,
"calibration/confidence_entropy": 0.444702059210076,
"calibration/coverage@0%": 0.1835072163523095,
"calibration/coverage@1%": 0.25085980835864113,
"calibration/coverage@10%": 0.6750243154773228,
"calibration/coverage@15%": 0.754269822612892,
"calibration/coverage@20%": 0.8678158253356143,
"calibration/coverage@25%": 0.9021978021978022,
"calibration/coverage@30%": 0.9181318681318682,
"calibration/coverage@5%": 0.5333837038338781,
"calibration/distribution_entropy_10": 0.8756279425815006,
"calibration/distribution_entropy_100": 0.9106554886391262,
"calibration/ece": 0.11786937630319141,
"calibration/mean_confidence": 0.6930237447381653,
"calibration/unique_confidence_per_question": 0.9671875,
"calibration/unique_confidences": 371.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.036805555555555536,
"completions/max_length": 2880.4,
"completions/max_terminated_length": 2880.4,
"completions/mean_length": 646.36328125,
"completions/mean_terminated_length": 671.0564208984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.6,
"epoch": 0.38399520005999926,
"grad_norm": 0.0026809952687472105,
"learning_rate": 1.4457831325301204e-06,
"loss": -0.1022,
"num_tokens": 328913396.0,
"reward": 0.9842295050621033,
"reward_std": 0.16013213694095613,
"rewards/accuracy_reward": 0.6764756917953492,
"rewards/batch_coverage_0": 0.3238168299198151,
"rewards/brier_reward": 0.7961023449897766,
"rewards/confidence_uniqueness_reward": 0.9045828580856323,
"rewards/format_reward": 0.9625000119209289,
"rewards/frontier_entropy_batch_reward": -0.37708542943000795,
"signal/accuracy_reward/centered_abs_mean": 0.14662000834941863,
"signal/accuracy_reward/group_std_mean": 0.19255328476428984,
"signal/accuracy_reward/group_zero_std_frac": 0.4527777791023254,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0311393976211547,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07331000417470931,
"signal/advantage_abs_mean": 0.7269745111465454,
"signal/advantage_pre_scale_abs_mean": 0.11633765548467637,
"signal/advantage_pre_scale_std": 0.19983015954494476,
"signal/advantage_std": 0.9830737590789795,
"signal/batch_coverage_0/centered_abs_mean": 0.15227404236793518,
"signal/batch_coverage_0/group_std_mean": 0.1953058809041977,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21439152359962463,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015227404795587063,
"signal/brier_reward/centered_abs_mean": 0.1461849495768547,
"signal/brier_reward/group_std_mean": 0.1892971932888031,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20605530440807343,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.014618495106697082,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07116645872592926,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10824198424816131,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.1000448852777481,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007116645760834217,
"signal/format_reward/centered_abs_mean": 0.05997178852558136,
"signal/format_reward/group_std_mean": 0.09567098915576935,
"signal/format_reward/group_zero_std_frac": 0.6666666626930237,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.4201627790927887,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.02998589426279068,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33648825287818906,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40563682913780214,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47566083669662473,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03364882543683052,
"step": 160
},
{
"calibration/aurc": 0.1314698647681308,
"calibration/batch_distribution_entropy": 0.9539865491003766,
"calibration/batch_entropy_100bins": 0.9465729070156297,
"calibration/batch_entropy_10bins": 0.9539865491003766,
"calibration/batch_entropy_50bins": 0.9570973253069107,
"calibration/batch_uniqueness": 0.9471188639087285,
"calibration/buffer_distribution_entropy": 0.9719415474139419,
"calibration/buffer_entropy_100bins": 0.9853135667011523,
"calibration/buffer_entropy_10bins": 0.9719415474139419,
"calibration/buffer_entropy_50bins": 0.9828414865175944,
"calibration/confidence_entropy": 0.4629173760167061,
"calibration/coverage@0%": 0.055678425540073115,
"calibration/coverage@1%": 0.055678425540073115,
"calibration/coverage@10%": 0.5788827150747762,
"calibration/coverage@15%": 0.6855880312968711,
"calibration/coverage@20%": 0.7366116007187298,
"calibration/coverage@25%": 0.7927668933284889,
"calibration/coverage@30%": 0.909690593071274,
"calibration/coverage@5%": 0.35066449037826136,
"calibration/distribution_entropy_10": 0.9539865491003766,
"calibration/distribution_entropy_100": 0.9465729070156297,
"calibration/ece": 0.20537345733881648,
"calibration/mean_confidence": 0.5303268611443963,
"calibration/unique_confidence_per_question": 0.9776041666666666,
"calibration/unique_confidences": 375.4,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.022135416666666675,
"completions/max_length": 3410.4,
"completions/max_terminated_length": 3410.4,
"completions/mean_length": 720.5960205078125,
"completions/mean_terminated_length": 736.8052001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.2,
"epoch": 0.39599505006187424,
"grad_norm": 0.0021246892865747213,
"learning_rate": 1.2951807228915664e-06,
"loss": -0.0594,
"num_tokens": 340353734.0,
"reward": 1.0025168299674987,
"reward_std": 0.13918909132480622,
"rewards/accuracy_reward": 0.6728298664093018,
"rewards/batch_coverage_0": 0.3095873832702637,
"rewards/brier_reward": 0.7944852590560914,
"rewards/confidence_uniqueness_reward": 0.929128909111023,
"rewards/format_reward": 0.9777777791023254,
"rewards/frontier_entropy_batch_reward": -0.2610716551542282,
"signal/accuracy_reward/centered_abs_mean": 0.1396755650639534,
"signal/accuracy_reward/group_std_mean": 0.18935762345790863,
"signal/accuracy_reward/group_zero_std_frac": 0.43888888955116273,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8814050197601319,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0698377825319767,
"signal/advantage_abs_mean": 0.7221243023872376,
"signal/advantage_pre_scale_abs_mean": 0.09935600310564041,
"signal/advantage_pre_scale_std": 0.16630815267562865,
"signal/advantage_std": 0.9832156300544739,
"signal/batch_coverage_0/centered_abs_mean": 0.1702214241027832,
"signal/batch_coverage_0/group_std_mean": 0.21486833691596985,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2145317405462265,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017022141814231874,
"signal/brier_reward/centered_abs_mean": 0.13762031197547914,
"signal/brier_reward/group_std_mean": 0.18010320365428925,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17319863736629487,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013762031495571137,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.045266738161444664,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07511035352945328,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.056387230008840564,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004526673769578338,
"signal/format_reward/centered_abs_mean": 0.03636067621409893,
"signal/format_reward/group_std_mean": 0.06495743244886398,
"signal/format_reward/group_zero_std_frac": 0.7500000119209289,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.22450997531414033,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.018180338107049464,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3157182991504669,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.38673295378684996,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4012265205383301,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03157182894647122,
"step": 165
},
{
"calibration/aurc": 0.11538836511334989,
"calibration/batch_distribution_entropy": 0.925228953185431,
"calibration/batch_entropy_100bins": 0.9315305819400574,
"calibration/batch_entropy_10bins": 0.925228953185431,
"calibration/batch_entropy_50bins": 0.9407669643923352,
"calibration/batch_uniqueness": 0.9407560578101188,
"calibration/buffer_distribution_entropy": 0.9727285409627902,
"calibration/buffer_entropy_100bins": 0.9857416673026582,
"calibration/buffer_entropy_10bins": 0.9727285409627902,
"calibration/buffer_entropy_50bins": 0.983333567489086,
"calibration/confidence_entropy": 0.4530259297587672,
"calibration/coverage@0%": 0.090934427705629,
"calibration/coverage@1%": 0.13260109437229567,
"calibration/coverage@10%": 0.5461385783004304,
"calibration/coverage@15%": 0.7319897814544014,
"calibration/coverage@20%": 0.8508060829526263,
"calibration/coverage@25%": 0.9216212225055326,
"calibration/coverage@30%": 0.9787373503445759,
"calibration/coverage@5%": 0.2340060737792186,
"calibration/distribution_entropy_10": 0.925228953185431,
"calibration/distribution_entropy_100": 0.9315305819400574,
"calibration/ece": 0.10519794172538484,
"calibration/mean_confidence": 0.6474902234638621,
"calibration/unique_confidence_per_question": 0.9885416666666667,
"calibration/unique_confidences": 379.6,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008680555555555535,
"completions/max_length": 3761.0,
"completions/max_terminated_length": 3761.0,
"completions/mean_length": 727.771533203125,
"completions/mean_terminated_length": 734.15712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 224.6,
"epoch": 0.4079949000637492,
"grad_norm": 0.00241382815875113,
"learning_rate": 1.1445783132530121e-06,
"loss": -0.0275,
"num_tokens": 351826846.0,
"reward": 1.032625389099121,
"reward_std": 0.12265074402093887,
"rewards/accuracy_reward": 0.7263020753860474,
"rewards/batch_coverage_0": 0.33543471097946165,
"rewards/brier_reward": 0.8319101929664612,
"rewards/confidence_uniqueness_reward": 0.9344179272651673,
"rewards/format_reward": 0.9912326335906982,
"rewards/frontier_entropy_batch_reward": -0.3631828844547272,
"signal/accuracy_reward/centered_abs_mean": 0.13776584416627885,
"signal/accuracy_reward/group_std_mean": 0.1854351818561554,
"signal/accuracy_reward/group_zero_std_frac": 0.4638888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0262118101119995,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06888292208313943,
"signal/advantage_abs_mean": 0.735342538356781,
"signal/advantage_pre_scale_abs_mean": 0.08866324722766876,
"signal/advantage_pre_scale_std": 0.14827727675437927,
"signal/advantage_std": 0.9829926133155823,
"signal/batch_coverage_0/centered_abs_mean": 0.15526262521743775,
"signal/batch_coverage_0/group_std_mean": 0.19668332040309905,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23164966702461243,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015526262111961842,
"signal/brier_reward/centered_abs_mean": 0.12140081971883773,
"signal/brier_reward/group_std_mean": 0.16084674894809722,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18126706182956695,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012140082381665706,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03103194199502468,
"signal/confidence_uniqueness_reward/group_std_mean": 0.05045064315199852,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04684874601662159,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003103194246068597,
"signal/format_reward/centered_abs_mean": 0.01565212644636631,
"signal/format_reward/group_std_mean": 0.03198937810957432,
"signal/format_reward/group_zero_std_frac": 0.8583333253860473,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.11856901496648789,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007826063223183155,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3486171066761017,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.41613501906394956,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5219741106033325,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034861710667610166,
"step": 170
},
{
"calibration/aurc": 0.08714649584563566,
"calibration/batch_distribution_entropy": 0.9582676263767113,
"calibration/batch_entropy_100bins": 0.9497775003333068,
"calibration/batch_entropy_10bins": 0.9582676263767113,
"calibration/batch_entropy_50bins": 0.9611324354823612,
"calibration/batch_uniqueness": 0.9488688537919441,
"calibration/buffer_distribution_entropy": 0.9723564713598757,
"calibration/buffer_entropy_100bins": 0.9855557420167175,
"calibration/buffer_entropy_10bins": 0.9723564713598757,
"calibration/buffer_entropy_50bins": 0.9831130728598103,
"calibration/confidence_entropy": 0.46930408368645865,
"calibration/coverage@0%": 0.1124440803878803,
"calibration/coverage@1%": 0.1946861438799438,
"calibration/coverage@10%": 0.6197900524033957,
"calibration/coverage@15%": 0.7821157887456747,
"calibration/coverage@20%": 0.9116034973982613,
"calibration/coverage@25%": 0.9745239509945393,
"calibration/coverage@30%": 1.0,
"calibration/coverage@5%": 0.40132178443876604,
"calibration/distribution_entropy_10": 0.9582676263767113,
"calibration/distribution_entropy_100": 0.9497775003333068,
"calibration/ece": 0.20635704384932518,
"calibration/mean_confidence": 0.5645161705073962,
"calibration/unique_confidence_per_question": 0.9864583333333334,
"calibration/unique_confidences": 378.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013454861111111115,
"completions/max_length": 3692.8,
"completions/max_terminated_length": 3692.8,
"completions/mean_length": 756.1865478515625,
"completions/mean_terminated_length": 766.48515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.4,
"epoch": 0.4199947500656242,
"grad_norm": 0.0023003392852842808,
"learning_rate": 9.93975903614458e-07,
"loss": -0.0367,
"num_tokens": 363646083.0,
"reward": 1.0302478551864624,
"reward_std": 0.12525633573532105,
"rewards/accuracy_reward": 0.7182291746139526,
"rewards/batch_coverage_0": 0.32915496826171875,
"rewards/brier_reward": 0.8144567489624024,
"rewards/confidence_uniqueness_reward": 0.934039568901062,
"rewards/format_reward": 0.9863715171813965,
"rewards/frontier_entropy_batch_reward": -0.2981763184070587,
"signal/accuracy_reward/centered_abs_mean": 0.14651692509651185,
"signal/accuracy_reward/group_std_mean": 0.19464356005191802,
"signal/accuracy_reward/group_zero_std_frac": 0.4416666626930237,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.059949767589569,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07325846254825592,
"signal/advantage_abs_mean": 0.7322659015655517,
"signal/advantage_pre_scale_abs_mean": 0.09160526245832443,
"signal/advantage_pre_scale_std": 0.1541106790304184,
"signal/advantage_std": 0.9830336213111878,
"signal/batch_coverage_0/centered_abs_mean": 0.17044160962104798,
"signal/batch_coverage_0/group_std_mean": 0.21640016436576842,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2487411081790924,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017044160701334476,
"signal/brier_reward/centered_abs_mean": 0.13068657219409943,
"signal/brier_reward/group_std_mean": 0.16981444656848907,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18954426646232606,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013068657368421555,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03548645004630089,
"signal/confidence_uniqueness_reward/group_std_mean": 0.055370701104402544,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.051629979908466336,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0035486449021846058,
"signal/format_reward/centered_abs_mean": 0.02208658866584301,
"signal/format_reward/group_std_mean": 0.03925186991691589,
"signal/format_reward/group_zero_std_frac": 0.8444444417953492,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.16051022261381148,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.011043294332921505,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32774901390075684,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3981485068798065,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47704883217811583,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03277490176260471,
"step": 175
},
{
"calibration/aurc": 0.08240090854620943,
"calibration/batch_distribution_entropy": 0.9407075408953339,
"calibration/batch_entropy_100bins": 0.9397327456537774,
"calibration/batch_entropy_10bins": 0.9407075408953339,
"calibration/batch_entropy_50bins": 0.9475896407362339,
"calibration/batch_uniqueness": 0.9445637852645128,
"calibration/buffer_distribution_entropy": 0.9721319964350418,
"calibration/buffer_entropy_100bins": 0.985457787774781,
"calibration/buffer_entropy_10bins": 0.9721319964350418,
"calibration/buffer_entropy_50bins": 0.9829938347745161,
"calibration/confidence_entropy": 0.46792723665558106,
"calibration/coverage@0%": 0.08522190425453813,
"calibration/coverage@1%": 0.12364295688611708,
"calibration/coverage@10%": 0.7188442015658267,
"calibration/coverage@15%": 0.8503315215324696,
"calibration/coverage@20%": 0.9222259642094972,
"calibration/coverage@25%": 0.9571244560487381,
"calibration/coverage@30%": 0.981201044386423,
"calibration/coverage@5%": 0.4305176868635339,
"calibration/distribution_entropy_10": 0.9407075408953339,
"calibration/distribution_entropy_100": 0.9397327456537774,
"calibration/ece": 0.156930699984235,
"calibration/mean_confidence": 0.6234170294374884,
"calibration/unique_confidence_per_question": 0.9822916666666668,
"calibration/unique_confidences": 377.2,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012413194444444465,
"completions/max_length": 3749.6,
"completions/max_terminated_length": 3749.6,
"completions/mean_length": 743.4819458007812,
"completions/mean_terminated_length": 752.8460205078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 243.2,
"epoch": 0.4319946000674992,
"grad_norm": 0.00234069861471653,
"learning_rate": 8.433734939759036e-07,
"loss": -0.0346,
"num_tokens": 375310963.0,
"reward": 1.0280283689498901,
"reward_std": 0.12910850942134858,
"rewards/accuracy_reward": 0.7190972208976746,
"rewards/batch_coverage_0": 0.31788222789764403,
"rewards/brier_reward": 0.8127464532852173,
"rewards/confidence_uniqueness_reward": 0.9341015100479126,
"rewards/format_reward": 0.9874131917953491,
"rewards/frontier_entropy_batch_reward": -0.3169987857341766,
"signal/accuracy_reward/centered_abs_mean": 0.14658203125,
"signal/accuracy_reward/group_std_mean": 0.19198833405971527,
"signal/accuracy_reward/group_zero_std_frac": 0.4583333432674408,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0368584036827087,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.073291015625,
"signal/advantage_abs_mean": 0.7443090558052063,
"signal/advantage_pre_scale_abs_mean": 0.09514848291873931,
"signal/advantage_pre_scale_std": 0.15735865235328675,
"signal/advantage_std": 0.9830684781074523,
"signal/batch_coverage_0/centered_abs_mean": 0.15284710228443146,
"signal/batch_coverage_0/group_std_mean": 0.19452511370182038,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21728379726409913,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015284710563719273,
"signal/brier_reward/centered_abs_mean": 0.12713438272476196,
"signal/brier_reward/group_std_mean": 0.16676026284694673,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18021451830863952,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012713438645005227,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03413993827998638,
"signal/confidence_uniqueness_reward/group_std_mean": 0.056762049347162245,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04856302812695503,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003413993865251541,
"signal/format_reward/centered_abs_mean": 0.02097981758415699,
"signal/format_reward/group_std_mean": 0.04119304716587067,
"signal/format_reward/group_zero_std_frac": 0.8222222328186035,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.14933316856622697,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.010489908792078496,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3364332139492035,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4029194176197052,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4776536524295807,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033643323183059695,
"step": 180
},
{
"calibration/aurc": 0.17622694813717843,
"calibration/batch_distribution_entropy": 0.9483525734268292,
"calibration/batch_entropy_100bins": 0.9437878593028112,
"calibration/batch_entropy_10bins": 0.9483525734268292,
"calibration/batch_entropy_50bins": 0.9537535291394281,
"calibration/batch_uniqueness": 0.9465086091799648,
"calibration/buffer_distribution_entropy": 0.9712713337492183,
"calibration/buffer_entropy_100bins": 0.9850464005867252,
"calibration/buffer_entropy_10bins": 0.9712713337492183,
"calibration/buffer_entropy_50bins": 0.9825057272465818,
"calibration/confidence_entropy": 0.48850146133497957,
"calibration/coverage@0%": 0.041510637632665943,
"calibration/coverage@1%": 0.041510637632665943,
"calibration/coverage@10%": 0.22907567729884618,
"calibration/coverage@15%": 0.42216374269005846,
"calibration/coverage@20%": 0.6706528723898518,
"calibration/coverage@25%": 0.8674514833646343,
"calibration/coverage@30%": 0.9366991892263297,
"calibration/coverage@5%": 0.06203695342213962,
"calibration/distribution_entropy_10": 0.9483525734268292,
"calibration/distribution_entropy_100": 0.9437878593028112,
"calibration/ece": 0.18354075972457048,
"calibration/mean_confidence": 0.5970391973608894,
"calibration/unique_confidence_per_question": 0.9916666666666668,
"calibration/unique_confidences": 380.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010677083333333327,
"completions/max_length": 3388.4,
"completions/max_terminated_length": 3388.4,
"completions/mean_length": 738.7443725585938,
"completions/mean_terminated_length": 746.72666015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.8,
"epoch": 0.44399445006937416,
"grad_norm": 0.0022149120923131704,
"learning_rate": 6.927710843373495e-07,
"loss": -0.0301,
"num_tokens": 386911346.0,
"reward": 1.0222719669342042,
"reward_std": 0.12421630620956421,
"rewards/accuracy_reward": 0.6988715291023254,
"rewards/batch_coverage_0": 0.313459575176239,
"rewards/brier_reward": 0.8119510054588318,
"rewards/confidence_uniqueness_reward": 0.9384174823760987,
"rewards/format_reward": 0.9892361164093018,
"rewards/frontier_entropy_batch_reward": -0.28164616525173186,
"signal/accuracy_reward/centered_abs_mean": 0.1408148854970932,
"signal/accuracy_reward/group_std_mean": 0.18646234571933745,
"signal/accuracy_reward/group_zero_std_frac": 0.46666666865348816,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9637740254402161,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0704074427485466,
"signal/advantage_abs_mean": 0.7450667142868042,
"signal/advantage_pre_scale_abs_mean": 0.09193661212921142,
"signal/advantage_pre_scale_std": 0.14988455176353455,
"signal/advantage_std": 0.9831077575683593,
"signal/batch_coverage_0/centered_abs_mean": 0.15791791081428527,
"signal/batch_coverage_0/group_std_mean": 0.1993875563144684,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21790952682495118,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015791791677474975,
"signal/brier_reward/centered_abs_mean": 0.1271849974989891,
"signal/brier_reward/group_std_mean": 0.16353001594543456,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17506942749023438,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012718500196933746,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.029871857166290282,
"signal/confidence_uniqueness_reward/group_std_mean": 0.049267768114805224,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04109632298350334,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0029871856328099965,
"signal/format_reward/centered_abs_mean": 0.01829427033662796,
"signal/format_reward/group_std_mean": 0.035397492721676825,
"signal/format_reward/group_zero_std_frac": 0.85,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.12542281299829483,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00914713516831398,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3175092935562134,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.38766228556633,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43858484625816346,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0317509301006794,
"step": 185
},
{
"calibration/aurc": 0.12269664298524821,
"calibration/batch_distribution_entropy": 0.9097176166639136,
"calibration/batch_entropy_100bins": 0.9256166673164021,
"calibration/batch_entropy_10bins": 0.9097176166639136,
"calibration/batch_entropy_50bins": 0.9305331134066039,
"calibration/batch_uniqueness": 0.9398041886522405,
"calibration/buffer_distribution_entropy": 0.97077519455814,
"calibration/buffer_entropy_100bins": 0.9848005927504747,
"calibration/buffer_entropy_10bins": 0.97077519455814,
"calibration/buffer_entropy_50bins": 0.9822313921481187,
"calibration/confidence_entropy": 0.4558133308321491,
"calibration/coverage@0%": 0.03350573937121969,
"calibration/coverage@1%": 0.08599917769142966,
"calibration/coverage@10%": 0.499440615097099,
"calibration/coverage@15%": 0.5598557534158825,
"calibration/coverage@20%": 0.882024124561112,
"calibration/coverage@25%": 0.9567708333333332,
"calibration/coverage@30%": 0.9791666666666667,
"calibration/coverage@5%": 0.420323940035368,
"calibration/distribution_entropy_10": 0.9097176166639136,
"calibration/distribution_entropy_100": 0.9256166673164021,
"calibration/ece": 0.14582941996305046,
"calibration/mean_confidence": 0.6561318203686277,
"calibration/unique_confidence_per_question": 0.9953125,
"calibration/unique_confidences": 382.2,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006510416666666674,
"completions/max_length": 3278.8,
"completions/max_terminated_length": 3278.8,
"completions/mean_length": 734.9641479492187,
"completions/mean_terminated_length": 739.8077758789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 231.0,
"epoch": 0.45599430007124914,
"grad_norm": 0.002926592016592622,
"learning_rate": 5.421686746987952e-07,
"loss": -0.0133,
"num_tokens": 398461077.0,
"reward": 1.0405804634094238,
"reward_std": 0.11852861940860748,
"rewards/accuracy_reward": 0.7360243082046509,
"rewards/batch_coverage_0": 0.3142602384090424,
"rewards/brier_reward": 0.8369073390960693,
"rewards/confidence_uniqueness_reward": 0.9390465021133423,
"rewards/format_reward": 0.9933159708976745,
"rewards/frontier_entropy_batch_reward": -0.3311110734939575,
"signal/accuracy_reward/centered_abs_mean": 0.13892686367034912,
"signal/accuracy_reward/group_std_mean": 0.18981125354766845,
"signal/accuracy_reward/group_zero_std_frac": 0.4222222208976746,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0067436337471007,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06946343183517456,
"signal/advantage_abs_mean": 0.727643609046936,
"signal/advantage_pre_scale_abs_mean": 0.08646512627601624,
"signal/advantage_pre_scale_std": 0.14237033128738402,
"signal/advantage_std": 0.9830322980880737,
"signal/batch_coverage_0/centered_abs_mean": 0.13719195425510405,
"signal/batch_coverage_0/group_std_mean": 0.17514330744743348,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.19944521486759187,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013719195872545243,
"signal/brier_reward/centered_abs_mean": 0.11094630658626556,
"signal/brier_reward/group_std_mean": 0.1479420006275177,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16077562868595124,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011094630509614945,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.026065148785710333,
"signal/confidence_uniqueness_reward/group_std_mean": 0.04158492237329483,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03798387497663498,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002606514934450388,
"signal/format_reward/centered_abs_mean": 0.011876085167750716,
"signal/format_reward/group_std_mean": 0.024491559341549875,
"signal/format_reward/group_zero_std_frac": 0.8916666746139527,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.08604965135455131,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.005938042583875358,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3279430866241455,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3947389841079712,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4778772532939911,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03279430828988552,
"step": 190
},
{
"calibration/aurc": 0.13273813775825055,
"calibration/batch_distribution_entropy": 0.9500027347292127,
"calibration/batch_entropy_100bins": 0.9452918277091291,
"calibration/batch_entropy_10bins": 0.9500027347292127,
"calibration/batch_entropy_50bins": 0.9573763993422191,
"calibration/batch_uniqueness": 0.9468649596642662,
"calibration/buffer_distribution_entropy": 0.9689344574900707,
"calibration/buffer_entropy_100bins": 0.9838738701234462,
"calibration/buffer_entropy_10bins": 0.9689344574900707,
"calibration/buffer_entropy_50bins": 0.9811441658406614,
"calibration/confidence_entropy": 0.47158294896673614,
"calibration/coverage@0%": 0.04583078782115089,
"calibration/coverage@1%": 0.04583078782115089,
"calibration/coverage@10%": 0.39430990423665824,
"calibration/coverage@15%": 0.638239374114255,
"calibration/coverage@20%": 0.8346849527195428,
"calibration/coverage@25%": 0.9250275069637883,
"calibration/coverage@30%": 0.9679628597957288,
"calibration/coverage@5%": 0.21747004220711577,
"calibration/distribution_entropy_10": 0.9500027347292127,
"calibration/distribution_entropy_100": 0.9452918277091291,
"calibration/ece": 0.1880414980348,
"calibration/mean_confidence": 0.5949393429926623,
"calibration/unique_confidence_per_question": 0.9682291666666666,
"calibration/unique_confidences": 371.8,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01440972222222221,
"completions/max_length": 3499.8,
"completions/max_terminated_length": 3499.8,
"completions/mean_length": 747.442822265625,
"completions/mean_terminated_length": 758.5841552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.46799415007312406,
"grad_norm": 0.0021557214204221964,
"learning_rate": 3.91566265060241e-07,
"loss": -0.0423,
"num_tokens": 410152482.0,
"reward": 1.0175115585327148,
"reward_std": 0.1258660525083542,
"rewards/accuracy_reward": 0.6973090291023254,
"rewards/batch_coverage_0": 0.33180397748947144,
"rewards/brier_reward": 0.8193559408187866,
"rewards/confidence_uniqueness_reward": 0.9310436129570008,
"rewards/format_reward": 0.9855902791023254,
"rewards/frontier_entropy_batch_reward": -0.3215842306613922,
"signal/accuracy_reward/centered_abs_mean": 0.13668077439069748,
"signal/accuracy_reward/group_std_mean": 0.18262230157852172,
"signal/accuracy_reward/group_zero_std_frac": 0.4750000059604645,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.977854585647583,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06834038719534874,
"signal/advantage_abs_mean": 0.7343301057815552,
"signal/advantage_pre_scale_abs_mean": 0.09162229895591736,
"signal/advantage_pre_scale_std": 0.15324749201536178,
"signal/advantage_std": 0.9830029845237732,
"signal/batch_coverage_0/centered_abs_mean": 0.14713969677686692,
"signal/batch_coverage_0/group_std_mean": 0.18895590007305146,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21866745948791505,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.014713969826698304,
"signal/brier_reward/centered_abs_mean": 0.12192474007606506,
"signal/brier_reward/group_std_mean": 0.1624041885137558,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17810553312301636,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012192474491894246,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03553067147731781,
"signal/confidence_uniqueness_reward/group_std_mean": 0.057105415314435956,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.052296724170446396,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0035530673805624245,
"signal/format_reward/centered_abs_mean": 0.02194010429084301,
"signal/format_reward/group_std_mean": 0.04070322811603546,
"signal/format_reward/group_zero_std_frac": 0.8361111044883728,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.15765180736780166,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.010970052145421506,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32328269481658933,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39172765612602234,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4804633140563965,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03232826963067055,
"step": 195
},
{
"calibration/aurc": 0.13867021776965863,
"calibration/batch_distribution_entropy": 0.9353270684263691,
"calibration/batch_entropy_100bins": 0.9368945411224996,
"calibration/batch_entropy_10bins": 0.9353270684263691,
"calibration/batch_entropy_50bins": 0.9452629006479147,
"calibration/batch_uniqueness": 0.9436675647789519,
"calibration/buffer_distribution_entropy": 0.969523410710188,
"calibration/buffer_entropy_100bins": 0.9841768835974831,
"calibration/buffer_entropy_10bins": 0.969523410710188,
"calibration/buffer_entropy_50bins": 0.9815198335463741,
"calibration/confidence_entropy": 0.48172871548592217,
"calibration/coverage@0%": 0.04717562944155479,
"calibration/coverage@1%": 0.04717562944155479,
"calibration/coverage@10%": 0.4994783742967181,
"calibration/coverage@15%": 0.6450320866998349,
"calibration/coverage@20%": 0.7019051892950392,
"calibration/coverage@25%": 0.9165333986074848,
"calibration/coverage@30%": 0.9618798955613578,
"calibration/coverage@5%": 0.20501936380830985,
"calibration/distribution_entropy_10": 0.9353270684263691,
"calibration/distribution_entropy_100": 0.9368945411224996,
"calibration/ece": 0.1467593867620299,
"calibration/mean_confidence": 0.6385691392790117,
"calibration/unique_confidence_per_question": 0.9979166666666668,
"calibration/unique_confidences": 383.2,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007725694444444442,
"completions/max_length": 3300.4,
"completions/max_terminated_length": 3300.4,
"completions/mean_length": 737.7262329101562,
"completions/mean_terminated_length": 743.5352416992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 241.2,
"epoch": 0.47999400007499904,
"grad_norm": 0.004845115356147289,
"learning_rate": 2.409638554216868e-07,
"loss": -0.0184,
"num_tokens": 421718896.0,
"reward": 1.03243727684021,
"reward_std": 0.11533114612102509,
"rewards/accuracy_reward": 0.7151041626930237,
"rewards/batch_coverage_0": 0.35005253553390503,
"rewards/brier_reward": 0.8234564542770386,
"rewards/confidence_uniqueness_reward": 0.9383493304252625,
"rewards/format_reward": 0.9921874880790711,
"rewards/frontier_entropy_batch_reward": -0.323943692445755,
"signal/accuracy_reward/centered_abs_mean": 0.13002387136220933,
"signal/accuracy_reward/group_std_mean": 0.17043959200382233,
"signal/accuracy_reward/group_zero_std_frac": 0.5166666746139527,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9897404432296752,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06501193568110467,
"signal/advantage_abs_mean": 0.7570445895195007,
"signal/advantage_pre_scale_abs_mean": 0.08690356314182282,
"signal/advantage_pre_scale_std": 0.14214968383312226,
"signal/advantage_std": 0.9829631567001342,
"signal/batch_coverage_0/centered_abs_mean": 0.146352881193161,
"signal/batch_coverage_0/group_std_mean": 0.18568513095378875,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22308144569396973,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.014635288156569004,
"signal/brier_reward/centered_abs_mean": 0.11911999583244323,
"signal/brier_reward/group_std_mean": 0.15573802292346955,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18171513974666595,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011911999993026256,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02734166830778122,
"signal/confidence_uniqueness_reward/group_std_mean": 0.042832625657320024,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04162702783942222,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0027341668028384447,
"signal/format_reward/centered_abs_mean": 0.012923177052289247,
"signal/format_reward/group_std_mean": 0.02554241828620434,
"signal/format_reward/group_zero_std_frac": 0.8888889074325561,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0979671061038971,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006461588526144623,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3360644280910492,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40355847477912904,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5128159046173095,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03360644280910492,
"step": 200
},
{
"epoch": 0.47999400007499904,
"eval_completions/clipped_ratio": 0.006076388888888895,
"eval_completions/max_length": 2380.5,
"eval_completions/max_terminated_length": 2380.5,
"eval_completions/mean_length": 742.8623555501302,
"eval_completions/mean_terminated_length": 747.4195251464844,
"eval_completions/min_length": 83.0,
"eval_completions/min_terminated_length": 232.83333333333334,
"eval_loss": 0.0,
"eval_num_tokens": 421718896.0,
"eval_reward": 0.9218910336494446,
"eval_reward_std": 0.22422286123037338,
"eval_rewards/accuracy_reward": 0.6953125,
"eval_rewards/batch_coverage_0": 0.05044874486823877,
"eval_rewards/brier_reward": 0.8252258896827698,
"eval_rewards/confidence_uniqueness_reward": 0.8909785449504852,
"eval_rewards/format_reward": 0.9939236144224802,
"eval_rewards/frontier_entropy_batch_reward": -0.9939236144224802,
"eval_runtime": 178.0731,
"eval_samples_per_second": 5.616,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4032660573720932,
"eval_signal/accuracy_reward/group_std_mean": 0.4539312819639842,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9079023202260336,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2016330286860466,
"eval_signal/advantage_abs_mean": 0.8540914952754974,
"eval_signal/advantage_pre_scale_abs_mean": 0.19251804302136102,
"eval_signal/advantage_pre_scale_std": 0.22243775675694147,
"eval_signal/advantage_std": 0.9863770306110382,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.17725825061400732,
"eval_signal/batch_coverage_0/group_std_mean": 0.25420698275168735,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07988938316702843,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.017725825930635136,
"eval_signal/brier_reward/centered_abs_mean": 0.17397530128558478,
"eval_signal/brier_reward/group_std_mean": 0.23295909663041434,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07850411782662074,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.01739752929036816,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.046860777462522187,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.069014647975564,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02106233189503352,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004686077823862433,
"eval_signal/format_reward/centered_abs_mean": 0.01177300326526165,
"eval_signal/format_reward/group_std_mean": 0.034373246133327484,
"eval_signal/format_reward/group_zero_std_frac": 0.8055555721124014,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.025194636546075344,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.005886501632630825,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.01177300326526165,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.034373246133327484,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.8055555721124014,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.005038927309215069,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0011773004274194439,
"eval_steps_per_second": 0.034,
"step": 200
},
{
"epoch": 0.47999400007499904,
"step": 200,
"train_probe_completions/clipped_ratio": 0.011284722222222229,
"train_probe_completions/max_length": 2415.6666666666665,
"train_probe_completions/max_terminated_length": 2415.6666666666665,
"train_probe_completions/mean_length": 747.6194661458334,
"train_probe_completions/mean_terminated_length": 756.1844787597656,
"train_probe_completions/min_length": 45.333333333333336,
"train_probe_completions/min_terminated_length": 237.33333333333334,
"train_probe_loss": 0.0,
"train_probe_num_tokens": 421718896.0,
"train_probe_reward": 0.9298777182896932,
"train_probe_reward_std": 0.23085811734199524,
"train_probe_rewards/accuracy_reward": 0.7187499900658926,
"train_probe_rewards/batch_coverage_0": 0.045416015510757766,
"train_probe_rewards/brier_reward": 0.8248612880706787,
"train_probe_rewards/confidence_uniqueness_reward": 0.8833604454994202,
"train_probe_rewards/format_reward": 0.9878472288449606,
"train_probe_rewards/frontier_entropy_batch_reward": -0.9878472288449606,
"train_probe_runtime": 205.2798,
"train_probe_samples_per_second": 4.871,
"train_probe_signal/accuracy_reward/centered_abs_mean": 0.3943142394224803,
"train_probe_signal/accuracy_reward/group_std_mean": 0.4489223013321559,
"train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0,
"train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8741428951422373,
"train_probe_signal/accuracy_reward/weight": 0.5,
"train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.19715711971124014,
"train_probe_signal/advantage_abs_mean": 0.8349892695744833,
"train_probe_signal/advantage_pre_scale_abs_mean": 0.1936338966091474,
"train_probe_signal/advantage_pre_scale_std": 0.23024354626735052,
"train_probe_signal/advantage_std": 0.9863850375016531,
"train_probe_signal/batch_coverage_0/centered_abs_mean": 0.16201606268684068,
"train_probe_signal/batch_coverage_0/group_std_mean": 0.2380441203713417,
"train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07158729247748852,
"train_probe_signal/batch_coverage_0/weight": 0.10000000149011612,
"train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.016201607262094814,
"train_probe_signal/brier_reward/centered_abs_mean": 0.17809604605038962,
"train_probe_signal/brier_reward/group_std_mean": 0.24038559198379517,
"train_probe_signal/brier_reward/group_zero_std_frac": 0.0,
"train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07882465422153473,
"train_probe_signal/brier_reward/weight": 0.10000000149011612,
"train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.017809605070700247,
"train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.0523125467201074,
"train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.09005353599786758,
"train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.023148629814386368,
"train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005231254966929555,
"train_probe_signal/format_reward/centered_abs_mean": 0.023437499689559143,
"train_probe_signal/format_reward/group_std_mean": 0.0657570833961169,
"train_probe_signal/format_reward/group_zero_std_frac": 0.6388889104127884,
"train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0509293849269549,
"train_probe_signal/format_reward/weight": 0.5,
"train_probe_signal/format_reward/weighted_centered_abs_mean": 0.011718749844779571,
"train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.023437499689559143,
"train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.0657570833961169,
"train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.6388889104127884,
"train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.01018587724926571,
"train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0023437499767169356,
"train_probe_steps_per_second": 0.029
},
{
"calibration/aurc": 0.12239767954214946,
"calibration/batch_distribution_entropy": 0.9301196629465762,
"calibration/batch_entropy_100bins": 0.9379092579008269,
"calibration/batch_entropy_10bins": 0.9301196629465762,
"calibration/batch_entropy_50bins": 0.9458583871040073,
"calibration/batch_uniqueness": 0.9434521817203038,
"calibration/buffer_distribution_entropy": 0.9698106620607934,
"calibration/buffer_entropy_100bins": 0.9843158921249298,
"calibration/buffer_entropy_10bins": 0.9698106620607934,
"calibration/buffer_entropy_50bins": 0.9816746594560855,
"calibration/confidence_entropy": 0.4687717514121054,
"calibration/coverage@0%": 0.039820526775052816,
"calibration/coverage@1%": 0.10648719344171949,
"calibration/coverage@10%": 0.4955941593184532,
"calibration/coverage@15%": 0.8548545673739323,
"calibration/coverage@20%": 0.9153638157894737,
"calibration/coverage@25%": 0.9544982456140352,
"calibration/coverage@30%": 0.9946666666666667,
"calibration/coverage@5%": 0.24880902222834572,
"calibration/distribution_entropy_10": 0.9301196629465762,
"calibration/distribution_entropy_100": 0.9379092579008269,
"calibration/ece": 0.18600186920056827,
"calibration/mean_confidence": 0.6150777857385921,
"calibration/unique_confidence_per_question": 0.9921875,
"calibration/unique_confidences": 381.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009461805555555581,
"completions/max_length": 3795.2,
"completions/max_terminated_length": 3795.2,
"completions/mean_length": 751.1654418945312,
"completions/mean_terminated_length": 758.399560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.4,
"epoch": 0.491993850076874,
"grad_norm": 0.0028297537937760353,
"learning_rate": 9.036144578313253e-08,
"loss": -0.0244,
"num_tokens": 433438274.0,
"reward": 1.0493927717208862,
"reward_std": 0.11577230393886566,
"rewards/accuracy_reward": 0.7571180462837219,
"rewards/batch_coverage_0": 0.30967661142349245,
"rewards/brier_reward": 0.8156781673431397,
"rewards/confidence_uniqueness_reward": 0.9383305668830871,
"rewards/format_reward": 0.9903645753860474,
"rewards/frontier_entropy_batch_reward": -0.3071707904338837,
"signal/accuracy_reward/centered_abs_mean": 0.12980685979127884,
"signal/accuracy_reward/group_std_mean": 0.17701960504055023,
"signal/accuracy_reward/group_zero_std_frac": 0.4694444477558136,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9657306671142578,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06490342989563942,
"signal/advantage_abs_mean": 0.737499189376831,
"signal/advantage_pre_scale_abs_mean": 0.08461317420005798,
"signal/advantage_pre_scale_std": 0.14267933070659639,
"signal/advantage_std": 0.982995581626892,
"signal/batch_coverage_0/centered_abs_mean": 0.14645980596542357,
"signal/batch_coverage_0/group_std_mean": 0.18418991863727568,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21881715953350067,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.014645980298519134,
"signal/brier_reward/centered_abs_mean": 0.1212809681892395,
"signal/brier_reward/group_std_mean": 0.15627764761447907,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18062220215797425,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012128096260130405,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.028160134702920912,
"signal/confidence_uniqueness_reward/group_std_mean": 0.044972692430019376,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.041933953016996386,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002816013665869832,
"signal/format_reward/centered_abs_mean": 0.015283203311264515,
"signal/format_reward/group_std_mean": 0.029318978637456895,
"signal/format_reward/group_zero_std_frac": 0.8777777791023255,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.1133392333984375,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007641601655632257,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33200740814208984,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39931764602661135,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4956660270690918,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03320074304938316,
"step": 205
},
{
"calibration/aurc": 0.09601065062995157,
"calibration/batch_distribution_entropy": 0.9517801328093567,
"calibration/batch_entropy_100bins": 0.944623264633357,
"calibration/batch_entropy_10bins": 0.9517801328093567,
"calibration/batch_entropy_50bins": 0.9570065362621402,
"calibration/batch_uniqueness": 0.9459938709762948,
"calibration/buffer_distribution_entropy": 0.9700833201805792,
"calibration/buffer_entropy_100bins": 0.9844871800056595,
"calibration/buffer_entropy_10bins": 0.9700833201805792,
"calibration/buffer_entropy_50bins": 0.981872492355922,
"calibration/confidence_entropy": 0.4824661780354327,
"calibration/coverage@0%": 0.09961418008127575,
"calibration/coverage@1%": 0.1136124302999984,
"calibration/coverage@10%": 0.6836619312925309,
"calibration/coverage@15%": 0.788315298707766,
"calibration/coverage@20%": 0.8868581375108789,
"calibration/coverage@25%": 0.9373368146214099,
"calibration/coverage@30%": 0.9904264577893821,
"calibration/coverage@5%": 0.29698768071745607,
"calibration/distribution_entropy_10": 0.9517801328093567,
"calibration/distribution_entropy_100": 0.944623264633357,
"calibration/ece": 0.12387473016449674,
"calibration/mean_confidence": 0.6253967837519441,
"calibration/unique_confidence_per_question": 0.9956597222222223,
"calibration/unique_confidences": 382.3333333333333,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500000000036,
"completions/max_length": 3279.3333333333335,
"completions/max_terminated_length": 3279.3333333333335,
"completions/mean_length": 759.5371907552084,
"completions/mean_terminated_length": 765.6172485351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 217.33333333333334,
"epoch": 0.49919376007799904,
"num_tokens": 440554243.0,
"reward": 1.028142770131429,
"reward_std": 0.1230898251136144,
"rewards/accuracy_reward": 0.7016782363255819,
"rewards/batch_coverage_0": 0.3386853138605754,
"rewards/brier_reward": 0.8224726319313049,
"rewards/confidence_uniqueness_reward": 0.9401467442512512,
"rewards/format_reward": 0.9921875,
"rewards/frontier_entropy_batch_reward": -0.2892061372598012,
"signal/accuracy_reward/centered_abs_mean": 0.15324797481298447,
"signal/accuracy_reward/group_std_mean": 0.19617354373137155,
"signal/accuracy_reward/group_zero_std_frac": 0.46759259700775146,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0735934575398762,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07662398740649223,
"signal/advantage_abs_mean": 0.7517887155214945,
"signal/advantage_pre_scale_abs_mean": 0.09261459857225418,
"signal/advantage_pre_scale_std": 0.14849475771188736,
"signal/advantage_std": 0.9830325643221537,
"signal/batch_coverage_0/centered_abs_mean": 0.15849936505158743,
"signal/batch_coverage_0/group_std_mean": 0.20025220016638437,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23052164912223816,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01584993799527486,
"signal/brier_reward/centered_abs_mean": 0.12450356781482697,
"signal/brier_reward/group_std_mean": 0.16133702794710794,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1783244808514913,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012450356036424637,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.026240961626172066,
"signal/confidence_uniqueness_reward/group_std_mean": 0.04592407991488775,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.037502444038788475,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0026240963488817215,
"signal/format_reward/centered_abs_mean": 0.013834635416666666,
"signal/format_reward/group_std_mean": 0.031048130244016647,
"signal/format_reward/group_zero_std_frac": 0.8518518606821696,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.09567297746737798,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006917317708333333,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3136086066563924,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3837442596753438,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.45284825563430786,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.031360860293110214,
"step": 208,
"total_flos": 0.0,
"train_loss": -0.04336074522535245,
"train_runtime": 40363.2953,
"train_samples_per_second": 0.372,
"train_steps_per_second": 0.005
}
],
"logging_steps": 5,
"max_steps": 208,
"num_input_tokens_seen": 440554243,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}