Files
RLCR-v4-ks-uniqueness-cov0-…/trainer_state.json
ModelHub XC 946426d444 初始化项目,由ModelHub XC社区提供模型
Model: hector-gr/RLCR-v4-ks-uniqueness-cov0-entropy100-noece-noaurc-scaletrue-batchcov-cold-math
Source: Original Platform
2026-05-01 16:16:34 +08:00

5721 lines
351 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.49919376007799904,
"eval_steps": 50,
"global_step": 208,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calibration/aurc": 0.5113611524759551,
"calibration/batch_distribution_entropy": 0.2727310928788026,
"calibration/confidence_entropy": 0.21980770767433938,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.4699071914305725,
"calibration/mean_confidence": 0.9158092758403239,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01944444444444444,
"completions/max_length": 4048.4,
"completions/max_terminated_length": 4048.4,
"completions/mean_length": 512.9665771484375,
"completions/mean_terminated_length": 523.1480102539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011999850001874977,
"grad_norm": 0.008225883357226849,
"learning_rate": 5.952380952380953e-07,
"loss": 0.0114,
"num_tokens": 9023583.0,
"reward": 0.4356141984462738,
"reward_std": 0.3870311200618744,
"rewards/accuracy_reward": 0.26093749403953553,
"rewards/batch_coverage_0": 0.008399064699187875,
"rewards/batch_coverage_1": 0.008399064699187875,
"rewards/batch_coverage_10": 0.025467225164175034,
"rewards/batch_coverage_15": 0.037757834792137145,
"rewards/batch_coverage_20": 0.06029712557792664,
"rewards/batch_coverage_25": 0.07480189204216003,
"rewards/batch_coverage_5": 0.015028292080387473,
"rewards/brier_reward": 0.3116580307483673,
"rewards/confidence_uniqueness_reward": 0.28452879190444946,
"rewards/format_reward": 0.5988715291023254,
"rewards/frontier_entropy_batch_reward": -0.5720015645027161,
"signal/accuracy_reward/centered_abs_mean": 0.3065212666988373,
"signal/accuracy_reward/group_std_mean": 0.36596824526786803,
"signal/accuracy_reward/group_zero_std_frac": 0.09444444552063942,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.4219505786895752,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.15326063334941864,
"signal/advantage_abs_mean": 0.8588703870773315,
"signal/advantage_pre_scale_abs_mean": 0.3350170612335205,
"signal/advantage_pre_scale_std": 0.39084741473197937,
"signal/advantage_std": 0.9841934680938721,
"signal/batch_coverage_0/centered_abs_mean": 0.016065572574734687,
"signal/batch_coverage_0/group_std_mean": 0.03175957277417183,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0006328160583507269,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00022973768063820897,
"signal/batch_coverage_1/centered_abs_mean": 0.016065572574734687,
"signal/batch_coverage_1/group_std_mean": 0.03175957277417183,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0006328160583507269,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.00022973768063820897,
"signal/batch_coverage_10/centered_abs_mean": 0.031208885833621025,
"signal/batch_coverage_10/group_std_mean": 0.04865182489156723,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.00122649057302624,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.00044628706527873876,
"signal/batch_coverage_15/centered_abs_mean": 0.045175787433981894,
"signal/batch_coverage_15/group_std_mean": 0.064193245023489,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.0017749476246535778,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0006460137956310064,
"signal/batch_coverage_20/centered_abs_mean": 0.0776137426495552,
"signal/batch_coverage_20/group_std_mean": 0.10128775835037232,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.0030479800887405874,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.00110987649532035,
"signal/batch_coverage_25/centered_abs_mean": 0.10338385552167892,
"signal/batch_coverage_25/group_std_mean": 0.13086765259504318,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.00406456789933145,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0014783891616389155,
"signal/batch_coverage_5/centered_abs_mean": 0.020831802859902382,
"signal/batch_coverage_5/group_std_mean": 0.03733638674020767,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0008195297850761563,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.00029789476830046626,
"signal/brier_reward/centered_abs_mean": 0.3176029086112976,
"signal/brier_reward/group_std_mean": 0.37121009826660156,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08744428902864457,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.03176029026508331,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.23409832119941712,
"signal/confidence_uniqueness_reward/group_std_mean": 0.28709676265716555,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06445259153842926,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.023409833386540412,
"signal/format_reward/centered_abs_mean": 0.43926323652267457,
"signal/format_reward/group_std_mean": 0.47427822947502135,
"signal/format_reward/group_zero_std_frac": 0.0,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.6049425005912781,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.21963161826133729,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.4491168737411499,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4817074775695801,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.12370291203260422,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.04491168931126595,
"step": 5
},
{
"calibration/aurc": 0.516741588823447,
"calibration/batch_distribution_entropy": 0.2900443053110763,
"calibration/confidence_entropy": 0.22429980386921775,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.46433237115415676,
"calibration/mean_confidence": 0.9163211270490192,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01562500000000002,
"completions/max_length": 3979.0,
"completions/max_terminated_length": 3979.0,
"completions/mean_length": 470.5865478515625,
"completions/mean_terminated_length": 478.19724731445314,
"completions/min_length": 0.0,
"completions/min_terminated_length": 21.6,
"epoch": 0.023999700003749954,
"grad_norm": 0.006776896305382252,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.012,
"num_tokens": 17527460.0,
"reward": 0.513310170173645,
"reward_std": 0.3529686212539673,
"rewards/accuracy_reward": 0.2857638895511627,
"rewards/batch_coverage_0": 0.00610975744202733,
"rewards/batch_coverage_1": 0.00610975744202733,
"rewards/batch_coverage_10": 0.02148380195721984,
"rewards/batch_coverage_15": 0.027421478927135468,
"rewards/batch_coverage_20": 0.040733900666236875,
"rewards/batch_coverage_25": 0.0546408973634243,
"rewards/batch_coverage_5": 0.008622795436531305,
"rewards/brier_reward": 0.3533738672733307,
"rewards/confidence_uniqueness_reward": 0.364860337972641,
"rewards/format_reward": 0.7322916746139526,
"rewards/frontier_entropy_batch_reward": -0.6990230441093445,
"signal/accuracy_reward/centered_abs_mean": 0.31860893964767456,
"signal/accuracy_reward/group_std_mean": 0.38112024068832395,
"signal/accuracy_reward/group_zero_std_frac": 0.055555556900799274,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.48522502183914185,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.15930446982383728,
"signal/advantage_abs_mean": 0.8068342566490173,
"signal/advantage_pre_scale_abs_mean": 0.2906586229801178,
"signal/advantage_pre_scale_std": 0.35583515763282775,
"signal/advantage_std": 0.984165096282959,
"signal/batch_coverage_0/centered_abs_mean": 0.015117424167692661,
"signal/batch_coverage_0/group_std_mean": 0.03145041689276695,
"signal/batch_coverage_0/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0006615896127186716,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00021617915772367268,
"signal/batch_coverage_1/centered_abs_mean": 0.015117424167692661,
"signal/batch_coverage_1/group_std_mean": 0.03145041689276695,
"signal/batch_coverage_1/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0006615896127186716,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.00021617915772367268,
"signal/batch_coverage_10/centered_abs_mean": 0.022562607005238534,
"signal/batch_coverage_10/group_std_mean": 0.04156057015061378,
"signal/batch_coverage_10/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0009827240835875272,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.00032264526817016304,
"signal/batch_coverage_15/centered_abs_mean": 0.027966295555233955,
"signal/batch_coverage_15/group_std_mean": 0.04787140339612961,
"signal/batch_coverage_15/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.0012141478480771184,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0003999180218670517,
"signal/batch_coverage_20/centered_abs_mean": 0.04405505768954754,
"signal/batch_coverage_20/group_std_mean": 0.06593646556138992,
"signal/batch_coverage_20/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.001893820520490408,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0006299873115494847,
"signal/batch_coverage_25/centered_abs_mean": 0.06627024859189987,
"signal/batch_coverage_25/group_std_mean": 0.09130300730466842,
"signal/batch_coverage_25/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.0028554079122841357,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0009476645383983851,
"signal/batch_coverage_5/centered_abs_mean": 0.015587140060961246,
"signal/batch_coverage_5/group_std_mean": 0.032326587662100795,
"signal/batch_coverage_5/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0006837769004050642,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.00022289610642474146,
"signal/brier_reward/centered_abs_mean": 0.3139029681682587,
"signal/brier_reward/group_std_mean": 0.3684773623943329,
"signal/brier_reward/group_zero_std_frac": 0.002777777798473835,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09546843618154525,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.031390297785401346,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.22378431260585785,
"signal/confidence_uniqueness_reward/group_std_mean": 0.2793233871459961,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06795128434896469,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.022378432005643843,
"signal/format_reward/centered_abs_mean": 0.3380750954151154,
"signal/format_reward/group_std_mean": 0.40706380009651183,
"signal/format_reward/group_zero_std_frac": 0.008333333395421505,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.5067545115947724,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.1690375477075577,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3691741645336151,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.43443808555603025,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.11110212206840515,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03691741861402988,
"step": 10
},
{
"calibration/aurc": 0.5431273408506506,
"calibration/batch_distribution_entropy": 0.30753752651198607,
"calibration/confidence_entropy": 0.25090009054126317,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.5192780321324778,
"calibration/mean_confidence": 0.9103578212120444,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009635416666666674,
"completions/max_length": 3870.8,
"completions/max_terminated_length": 3870.8,
"completions/mean_length": 402.1271728515625,
"completions/mean_terminated_length": 406.04705810546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.8,
"epoch": 0.03599955000562493,
"grad_norm": 0.0055563561618328094,
"learning_rate": 1.7857142857142859e-06,
"loss": -0.0366,
"num_tokens": 25261949.0,
"reward": 0.6459226131439209,
"reward_std": 0.2591663062572479,
"rewards/accuracy_reward": 0.31822916865348816,
"rewards/batch_coverage_0": 0.010474585834890604,
"rewards/batch_coverage_1": 0.010474585834890604,
"rewards/batch_coverage_10": 0.025091929733753203,
"rewards/batch_coverage_15": 0.033538448810577395,
"rewards/batch_coverage_20": 0.04148237034678459,
"rewards/batch_coverage_25": 0.055399445444345476,
"rewards/batch_coverage_5": 0.01288955081254244,
"rewards/brier_reward": 0.4329886555671692,
"rewards/confidence_uniqueness_reward": 0.5421028196811676,
"rewards/format_reward": 0.9553819417953491,
"rewards/frontier_entropy_batch_reward": -0.9109981417655945,
"signal/accuracy_reward/centered_abs_mean": 0.3211480021476746,
"signal/accuracy_reward/group_std_mean": 0.38029863834381106,
"signal/accuracy_reward/group_zero_std_frac": 0.0777777798473835,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.7604451894760131,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.1605740010738373,
"signal/advantage_abs_mean": 0.7640881419181824,
"signal/advantage_pre_scale_abs_mean": 0.2078488826751709,
"signal/advantage_pre_scale_std": 0.26615193486213684,
"signal/advantage_std": 0.9839967846870422,
"signal/batch_coverage_0/centered_abs_mean": 0.02042274661362171,
"signal/batch_coverage_0/group_std_mean": 0.03724094405770302,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.001394424750469625,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00029204529710114,
"signal/batch_coverage_1/centered_abs_mean": 0.02042274661362171,
"signal/batch_coverage_1/group_std_mean": 0.03724094405770302,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.001394424750469625,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.00029204529710114,
"signal/batch_coverage_10/centered_abs_mean": 0.025325803831219674,
"signal/batch_coverage_10/group_std_mean": 0.045688331872224805,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0017356230178847908,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0003621589916292578,
"signal/batch_coverage_15/centered_abs_mean": 0.030855680257081984,
"signal/batch_coverage_15/group_std_mean": 0.05317860469222069,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.0020888552302494646,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0004412362352013588,
"signal/batch_coverage_20/centered_abs_mean": 0.03871218115091324,
"signal/batch_coverage_20/group_std_mean": 0.06303965002298355,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.0026147721568122507,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0005535841512028128,
"signal/batch_coverage_25/centered_abs_mean": 0.055917789041996,
"signal/batch_coverage_25/group_std_mean": 0.0836059644818306,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.003946993872523308,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0007996243657544255,
"signal/batch_coverage_5/centered_abs_mean": 0.02079017162322998,
"signal/batch_coverage_5/group_std_mean": 0.038049327582120894,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0014218244701623916,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.00029729946982115507,
"signal/brier_reward/centered_abs_mean": 0.29541789889335635,
"signal/brier_reward/group_std_mean": 0.3477012634277344,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.13953412771224977,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.02954179123044014,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.17255294919013978,
"signal/confidence_uniqueness_reward/group_std_mean": 0.21755547523498536,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.08172692656517029,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.01725529581308365,
"signal/format_reward/centered_abs_mean": 0.07890624776482583,
"signal/format_reward/group_std_mean": 0.15041253715753555,
"signal/format_reward/group_zero_std_frac": 0.3944444447755814,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.17634160071611404,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.03945312388241291,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.15291024446487428,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.2610882967710495,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.11666666883975267,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.07050499767065048,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.015291025303304196,
"step": 15
},
{
"calibration/aurc": 0.45601536234914103,
"calibration/batch_distribution_entropy": 0.5371427876707766,
"calibration/buffer_distribution_entropy": 0.3484450649060329,
"calibration/confidence_entropy": 0.3684050618087809,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.013123359580052493,
"calibration/coverage@25%": 0.014173228346456693,
"calibration/coverage@30%": 0.09238845144356955,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.35893722603768324,
"calibration/mean_confidence": 0.8490547748862503,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00972222222222221,
"completions/max_length": 3925.0,
"completions/max_terminated_length": 3925.0,
"completions/mean_length": 440.58714599609374,
"completions/mean_terminated_length": 444.94862060546876,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.6,
"epoch": 0.04799940000749991,
"grad_norm": 0.005572207737714052,
"learning_rate": 2.380952380952381e-06,
"loss": -0.0279,
"num_tokens": 33451209.0,
"reward": 0.7563788414001464,
"reward_std": 0.22359791100025178,
"rewards/accuracy_reward": 0.44314236640930177,
"rewards/batch_coverage_0": 0.04085179083049297,
"rewards/batch_coverage_1": 0.04085179083049297,
"rewards/batch_coverage_10": 0.09493901133537293,
"rewards/batch_coverage_15": 0.10782753825187683,
"rewards/batch_coverage_20": 0.1439335286617279,
"rewards/batch_coverage_25": 0.16521188020706176,
"rewards/batch_coverage_5": 0.05991588160395622,
"rewards/brier_reward": 0.582850182056427,
"rewards/confidence_uniqueness_reward": 0.6730621218681335,
"rewards/format_reward": 0.9874131798744201,
"rewards/frontier_entropy_batch_reward": -0.9383567333221435,
"signal/accuracy_reward/centered_abs_mean": 0.2931477904319763,
"signal/accuracy_reward/group_std_mean": 0.3582261562347412,
"signal/accuracy_reward/group_zero_std_frac": 0.09722222313284874,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.951709794998169,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.14657389521598815,
"signal/advantage_abs_mean": 0.7565471768379212,
"signal/advantage_pre_scale_abs_mean": 0.17789879739284514,
"signal/advantage_pre_scale_std": 0.23252987563610078,
"signal/advantage_std": 0.9838254809379577,
"signal/batch_coverage_0/centered_abs_mean": 0.040375912189483644,
"signal/batch_coverage_0/group_std_mean": 0.06719064265489579,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.003825485659763217,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0005773755547124892,
"signal/batch_coverage_1/centered_abs_mean": 0.040375912189483644,
"signal/batch_coverage_1/group_std_mean": 0.06719064265489579,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.003825485659763217,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0005773755547124892,
"signal/batch_coverage_10/centered_abs_mean": 0.05652825087308884,
"signal/batch_coverage_10/group_std_mean": 0.0937819629907608,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.005391606315970421,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0008083539898507297,
"signal/batch_coverage_15/centered_abs_mean": 0.06388088911771775,
"signal/batch_coverage_15/group_std_mean": 0.10372960716485977,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.00607602009549737,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0009134967462159693,
"signal/batch_coverage_20/centered_abs_mean": 0.09250121712684631,
"signal/batch_coverage_20/group_std_mean": 0.14216586351394653,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.008676209393888712,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0013227674178779125,
"signal/batch_coverage_25/centered_abs_mean": 0.1186644583940506,
"signal/batch_coverage_25/group_std_mean": 0.1730334848165512,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.011201648693531751,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.00169690172187984,
"signal/batch_coverage_5/centered_abs_mean": 0.04421483613550663,
"signal/batch_coverage_5/group_std_mean": 0.073546851426363,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.004192000767216086,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0006322721135802567,
"signal/brier_reward/centered_abs_mean": 0.24310141503810884,
"signal/brier_reward/group_std_mean": 0.296148020029068,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1566994845867157,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.024310142174363135,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.13678000569343568,
"signal/confidence_uniqueness_reward/group_std_mean": 0.16919248104095458,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0862940102815628,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.013678001426160335,
"signal/format_reward/centered_abs_mean": 0.02312825545668602,
"signal/format_reward/group_std_mean": 0.0512887679040432,
"signal/format_reward/group_zero_std_frac": 0.7611111164093017,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.07421396747231483,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01156412772834301,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.10914571136236191,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.215337872505188,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.23888888955116272,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.0710667297244072,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.010914571583271027,
"step": 20
},
{
"calibration/aurc": 0.3241197541288493,
"calibration/batch_distribution_entropy": 0.7326891440981866,
"calibration/buffer_distribution_entropy": 0.4592126091407316,
"calibration/confidence_entropy": 0.4994222128178114,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0431303389363461,
"calibration/coverage@15%": 0.0862882336731882,
"calibration/coverage@20%": 0.1499216513704103,
"calibration/coverage@25%": 0.3922831111479822,
"calibration/coverage@30%": 0.4921012428647086,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.16337910087999727,
"calibration/mean_confidence": 0.7518155108340762,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010503472222222232,
"completions/max_length": 3511.6,
"completions/max_terminated_length": 3511.6,
"completions/mean_length": 522.8118041992187,
"completions/mean_terminated_length": 528.3630310058594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.2,
"epoch": 0.05999925000937488,
"grad_norm": 0.0037775952368974686,
"learning_rate": 2.9761904761904763e-06,
"loss": -0.0267,
"num_tokens": 42598449.0,
"reward": 0.8629345178604126,
"reward_std": 0.20029426813125611,
"rewards/accuracy_reward": 0.5723958373069763,
"rewards/batch_coverage_0": 0.08039978817105294,
"rewards/batch_coverage_1": 0.08039978817105294,
"rewards/batch_coverage_10": 0.17986115962266921,
"rewards/batch_coverage_15": 0.1960737258195877,
"rewards/batch_coverage_20": 0.21463773548603057,
"rewards/batch_coverage_25": 0.23300228416919708,
"rewards/batch_coverage_5": 0.12557416558265685,
"rewards/brier_reward": 0.7168434143066407,
"rewards/confidence_uniqueness_reward": 0.7757420778274536,
"rewards/format_reward": 0.9881076335906982,
"rewards/frontier_entropy_batch_reward": -0.8244805216789246,
"signal/accuracy_reward/centered_abs_mean": 0.2552517354488373,
"signal/accuracy_reward/group_std_mean": 0.32020156383514403,
"signal/accuracy_reward/group_zero_std_frac": 0.1555555582046509,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9001852035522461,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.12762586772441864,
"signal/advantage_abs_mean": 0.7349266171455383,
"signal/advantage_pre_scale_abs_mean": 0.15444318056106568,
"signal/advantage_pre_scale_std": 0.21327590942382812,
"signal/advantage_std": 0.9837662577629089,
"signal/batch_coverage_0/centered_abs_mean": 0.06751533597707748,
"signal/batch_coverage_0/group_std_mean": 0.09494247138500214,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.006873792223632336,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0009654692490585149,
"signal/batch_coverage_1/centered_abs_mean": 0.06751533597707748,
"signal/batch_coverage_1/group_std_mean": 0.09494247138500214,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.006873792223632336,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0009654692490585149,
"signal/batch_coverage_10/centered_abs_mean": 0.09968591183423996,
"signal/batch_coverage_10/group_std_mean": 0.14351416528224945,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.010167979542165995,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0014255085028707982,
"signal/batch_coverage_15/centered_abs_mean": 0.10670627057552337,
"signal/batch_coverage_15/group_std_mean": 0.15375309437513351,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01088944710791111,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0015258996281772852,
"signal/batch_coverage_20/centered_abs_mean": 0.12048476487398148,
"signal/batch_coverage_20/group_std_mean": 0.1724111407995224,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.012245130911469459,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0017229320714250208,
"signal/batch_coverage_25/centered_abs_mean": 0.13844509273767472,
"signal/batch_coverage_25/group_std_mean": 0.19501933455467224,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.01405192855745554,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.001979764853604138,
"signal/batch_coverage_5/centered_abs_mean": 0.07986659109592438,
"signal/batch_coverage_5/group_std_mean": 0.11289454102516175,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.00815016021952033,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0011420922819525003,
"signal/brier_reward/centered_abs_mean": 0.1660325288772583,
"signal/brier_reward/group_std_mean": 0.21059280633926392,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.11725894808769226,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016603253595530987,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.12287691384553909,
"signal/confidence_uniqueness_reward/group_std_mean": 0.14854514896869658,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.08736316412687302,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.012287691608071328,
"signal/format_reward/centered_abs_mean": 0.02112087681889534,
"signal/format_reward/group_std_mean": 0.03975438773632049,
"signal/format_reward/group_zero_std_frac": 0.8416666746139526,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0743426114320755,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01056043840944767,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.25322854369878767,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3676407068967819,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.07500000298023224,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.18085478246212006,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.025322853401303292,
"step": 25
},
{
"calibration/aurc": 0.27955515653207896,
"calibration/batch_distribution_entropy": 0.9364864588621179,
"calibration/buffer_distribution_entropy": 0.5970358735855067,
"calibration/confidence_entropy": 0.5301070483821728,
"calibration/coverage@0%": 0.009701449275362319,
"calibration/coverage@1%": 0.009701449275362319,
"calibration/coverage@10%": 0.015085507246376811,
"calibration/coverage@15%": 0.06695507246376811,
"calibration/coverage@20%": 0.1699693094629156,
"calibration/coverage@25%": 0.34508087266527165,
"calibration/coverage@30%": 0.711757724689686,
"calibration/coverage@5%": 0.009701449275362319,
"calibration/ece": 0.17340574561883063,
"calibration/mean_confidence": 0.6036285913200597,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.014583333333333327,
"completions/max_length": 3165.4,
"completions/max_terminated_length": 3165.4,
"completions/mean_length": 613.1007080078125,
"completions/mean_terminated_length": 622.1764770507813,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.4,
"epoch": 0.07199910001124986,
"grad_norm": 0.0026056095957756042,
"learning_rate": 3.5714285714285718e-06,
"loss": -0.0293,
"num_tokens": 52771289.0,
"reward": 0.9426739931106567,
"reward_std": 0.18256551921367645,
"rewards/accuracy_reward": 0.60546875,
"rewards/batch_coverage_0": 0.11707074046134949,
"rewards/batch_coverage_1": 0.11707074046134949,
"rewards/batch_coverage_10": 0.19703109860420226,
"rewards/batch_coverage_15": 0.21080294847488404,
"rewards/batch_coverage_20": 0.2307416707277298,
"rewards/batch_coverage_25": 0.23985148668289186,
"rewards/batch_coverage_5": 0.15007564574480056,
"rewards/brier_reward": 0.7217867493629455,
"rewards/confidence_uniqueness_reward": 0.9331858634948731,
"rewards/format_reward": 0.9842013955116272,
"rewards/frontier_entropy_batch_reward": -0.3571417719125748,
"signal/accuracy_reward/centered_abs_mean": 0.22742512822151184,
"signal/accuracy_reward/group_std_mean": 0.286023223400116,
"signal/accuracy_reward/group_zero_std_frac": 0.2361111134290695,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8609166383743286,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.11371256411075592,
"signal/advantage_abs_mean": 0.7514987826347351,
"signal/advantage_pre_scale_abs_mean": 0.1399205207824707,
"signal/advantage_pre_scale_std": 0.19905700385570527,
"signal/advantage_std": 0.9837118506431579,
"signal/batch_coverage_0/centered_abs_mean": 0.17423317432403565,
"signal/batch_coverage_0/group_std_mean": 0.22805773317813874,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.01937145460397005,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0024915344547480345,
"signal/batch_coverage_1/centered_abs_mean": 0.17423317432403565,
"signal/batch_coverage_1/group_std_mean": 0.22805773317813874,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.01937145460397005,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0024915344547480345,
"signal/batch_coverage_10/centered_abs_mean": 0.1992947429418564,
"signal/batch_coverage_10/group_std_mean": 0.25936625003814695,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.022066234052181243,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.00284991473890841,
"signal/batch_coverage_15/centered_abs_mean": 0.201465904712677,
"signal/batch_coverage_15/group_std_mean": 0.2615888833999634,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.022288778424263002,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.00288096247240901,
"signal/batch_coverage_20/centered_abs_mean": 0.21147489845752715,
"signal/batch_coverage_20/group_std_mean": 0.2743402898311615,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02333966102451086,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.003024091012775898,
"signal/batch_coverage_25/centered_abs_mean": 0.2154662013053894,
"signal/batch_coverage_25/group_std_mean": 0.2790635824203491,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.02376740947365761,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0030811666045337917,
"signal/batch_coverage_5/centered_abs_mean": 0.18447383344173432,
"signal/batch_coverage_5/group_std_mean": 0.24043427407741547,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.020505336113274097,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0026379758259281517,
"signal/brier_reward/centered_abs_mean": 0.19326081573963166,
"signal/brier_reward/group_std_mean": 0.2406743735074997,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1484305217862129,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01932608112692833,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03934527114033699,
"signal/confidence_uniqueness_reward/group_std_mean": 0.06451441496610641,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.029453575611114502,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003934527095407247,
"signal/format_reward/centered_abs_mean": 0.02638888843357563,
"signal/format_reward/group_std_mean": 0.04935496896505356,
"signal/format_reward/group_zero_std_frac": 0.7972222328186035,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0981840431690216,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.013194444216787815,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3794820189476013,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4454668164253235,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.28796843588352206,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03794820159673691,
"step": 30
},
{
"calibration/aurc": 0.24226920367701021,
"calibration/batch_distribution_entropy": 0.9581315958770873,
"calibration/buffer_distribution_entropy": 0.7127915712755234,
"calibration/confidence_entropy": 0.4551993687512953,
"calibration/coverage@0%": 0.009962875385701033,
"calibration/coverage@1%": 0.009962875385701033,
"calibration/coverage@10%": 0.04031613014830156,
"calibration/coverage@15%": 0.19086098051820694,
"calibration/coverage@20%": 0.3238800048827573,
"calibration/coverage@25%": 0.6257067655588412,
"calibration/coverage@30%": 0.8198433420365536,
"calibration/coverage@5%": 0.01675138713504829,
"calibration/ece": 0.2122458842995219,
"calibration/mean_confidence": 0.5957768150519883,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.018055555555555557,
"completions/max_length": 3879.2,
"completions/max_terminated_length": 3879.2,
"completions/mean_length": 649.6018310546875,
"completions/mean_terminated_length": 661.6557495117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.6,
"epoch": 0.08399895001312484,
"grad_norm": 0.002304090652614832,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0483,
"num_tokens": 63332142.0,
"reward": 0.9694664359092713,
"reward_std": 0.17354933619499208,
"rewards/accuracy_reward": 0.6381944417953491,
"rewards/batch_coverage_0": 0.15102842450141907,
"rewards/batch_coverage_1": 0.15102842450141907,
"rewards/batch_coverage_10": 0.2521698772907257,
"rewards/batch_coverage_15": 0.2756618678569794,
"rewards/batch_coverage_20": 0.2928420454263687,
"rewards/batch_coverage_25": 0.3004423320293427,
"rewards/batch_coverage_5": 0.20541033744812012,
"rewards/brier_reward": 0.7120538473129272,
"rewards/confidence_uniqueness_reward": 0.9275844931602478,
"rewards/format_reward": 0.9814236044883728,
"rewards/frontier_entropy_batch_reward": -0.27595201730728147,
"signal/accuracy_reward/centered_abs_mean": 0.19836153984069824,
"signal/accuracy_reward/group_std_mean": 0.25724024176597593,
"signal/accuracy_reward/group_zero_std_frac": 0.2888888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8888693213462829,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09918076992034912,
"signal/advantage_abs_mean": 0.7374646306037903,
"signal/advantage_pre_scale_abs_mean": 0.12931651920080184,
"signal/advantage_pre_scale_std": 0.1932118058204651,
"signal/advantage_std": 0.983579683303833,
"signal/batch_coverage_0/centered_abs_mean": 0.22375875115394592,
"signal/batch_coverage_0/group_std_mean": 0.28625078201293946,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.02871074862778187,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00319975009188056,
"signal/batch_coverage_1/centered_abs_mean": 0.22375875115394592,
"signal/batch_coverage_1/group_std_mean": 0.28625078201293946,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.02871074862778187,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.00319975009188056,
"signal/batch_coverage_10/centered_abs_mean": 0.24917038083076476,
"signal/batch_coverage_10/group_std_mean": 0.3143121063709259,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03190204724669456,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.003563136560842395,
"signal/batch_coverage_15/centered_abs_mean": 0.25848992466926574,
"signal/batch_coverage_15/group_std_mean": 0.32483440041542055,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.033208982273936274,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.00369640588760376,
"signal/batch_coverage_20/centered_abs_mean": 0.2676124572753906,
"signal/batch_coverage_20/group_std_mean": 0.3357407510280609,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03440100736916065,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.00382685805670917,
"signal/batch_coverage_25/centered_abs_mean": 0.2681353986263275,
"signal/batch_coverage_25/group_std_mean": 0.3352727711200714,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03452103175222874,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.003834336344152689,
"signal/batch_coverage_5/centered_abs_mean": 0.23883576095104217,
"signal/batch_coverage_5/group_std_mean": 0.3027981579303741,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03062332086265087,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0034153513144701718,
"signal/brier_reward/centered_abs_mean": 0.23339413106441498,
"signal/brier_reward/group_std_mean": 0.2831204056739807,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20949692130088807,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.023339413478970526,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04882469177246094,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07911910861730576,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04447423368692398,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004882469354197383,
"signal/format_reward/centered_abs_mean": 0.03072916641831398,
"signal/format_reward/group_std_mean": 0.05717795491218567,
"signal/format_reward/group_zero_std_frac": 0.7638888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.1347813993692398,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01536458320915699,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3622328042984009,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.43260804414749143,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.3302638798952103,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.036223283410072325,
"step": 35
},
{
"calibration/aurc": 0.2562137876806586,
"calibration/batch_distribution_entropy": 0.938866150096856,
"calibration/buffer_distribution_entropy": 0.7625007518494724,
"calibration/confidence_entropy": 0.48723283405102336,
"calibration/coverage@0%": 0.005274540682414698,
"calibration/coverage@1%": 0.005274540682414698,
"calibration/coverage@10%": 0.026074540682414698,
"calibration/coverage@15%": 0.0667716535433071,
"calibration/coverage@20%": 0.3302887139107612,
"calibration/coverage@25%": 0.5345121572569387,
"calibration/coverage@30%": 0.690819009325962,
"calibration/coverage@5%": 0.005274540682414698,
"calibration/ece": 0.17700430434619707,
"calibration/mean_confidence": 0.615831670720631,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01519097222222221,
"completions/max_length": 3865.4,
"completions/max_terminated_length": 3865.4,
"completions/mean_length": 663.9608642578125,
"completions/mean_terminated_length": 674.2513427734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 179.6,
"epoch": 0.09599880001499982,
"grad_norm": 0.0024075538385659456,
"learning_rate": 4.761904761904762e-06,
"loss": -0.0471,
"num_tokens": 74100491.0,
"reward": 0.9821699857711792,
"reward_std": 0.16510820984840394,
"rewards/accuracy_reward": 0.6545138955116272,
"rewards/batch_coverage_0": 0.18086645603179932,
"rewards/batch_coverage_1": 0.18086645603179932,
"rewards/batch_coverage_10": 0.2809440016746521,
"rewards/batch_coverage_15": 0.29933875799179077,
"rewards/batch_coverage_20": 0.3136002361774445,
"rewards/batch_coverage_25": 0.3249046623706818,
"rewards/batch_coverage_5": 0.23657204806804658,
"rewards/brier_reward": 0.7423569560050964,
"rewards/confidence_uniqueness_reward": 0.9304731726646424,
"rewards/format_reward": 0.9846354126930237,
"rewards/frontier_entropy_batch_reward": -0.30672123432159426,
"signal/accuracy_reward/centered_abs_mean": 0.1842881917953491,
"signal/accuracy_reward/group_std_mean": 0.24733619391918182,
"signal/accuracy_reward/group_zero_std_frac": 0.28888889849185945,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8925943851470948,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09214409589767455,
"signal/advantage_abs_mean": 0.7237564325332642,
"signal/advantage_pre_scale_abs_mean": 0.12071978598833084,
"signal/advantage_pre_scale_std": 0.18614649176597595,
"signal/advantage_std": 0.9835057854652405,
"signal/batch_coverage_0/centered_abs_mean": 0.19749594628810882,
"signal/batch_coverage_0/group_std_mean": 0.2580998420715332,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.027659310027956963,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0028241920750588178,
"signal/batch_coverage_1/centered_abs_mean": 0.19749594628810882,
"signal/batch_coverage_1/group_std_mean": 0.2580998420715332,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.027659310027956963,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0028241920750588178,
"signal/batch_coverage_10/centered_abs_mean": 0.2273343801498413,
"signal/batch_coverage_10/group_std_mean": 0.29257852435112,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03175325132906437,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.003250881750136614,
"signal/batch_coverage_15/centered_abs_mean": 0.2346246600151062,
"signal/batch_coverage_15/group_std_mean": 0.3009405076503754,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03282949589192867,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.003355132741853595,
"signal/batch_coverage_20/centered_abs_mean": 0.24077640175819398,
"signal/batch_coverage_20/group_std_mean": 0.307919579744339,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.033719856292009354,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.003443102678284049,
"signal/batch_coverage_25/centered_abs_mean": 0.24552258849143982,
"signal/batch_coverage_25/group_std_mean": 0.31313495635986327,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.034370492398738864,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0035109729506075382,
"signal/batch_coverage_5/centered_abs_mean": 0.2139654278755188,
"signal/batch_coverage_5/group_std_mean": 0.2763186156749725,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.02990221492946148,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.00305970567278564,
"signal/brier_reward/centered_abs_mean": 0.2017700344324112,
"signal/brier_reward/group_std_mean": 0.25099734365940096,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1979275941848755,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.020177004113793372,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04290001392364502,
"signal/confidence_uniqueness_reward/group_std_mean": 0.06807700842618943,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04234274849295616,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004290001420304179,
"signal/format_reward/centered_abs_mean": 0.02630750872194767,
"signal/format_reward/group_std_mean": 0.04854954332113266,
"signal/format_reward/group_zero_std_frac": 0.8027778029441833,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.12644327580928802,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.013153754360973834,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.37426209449768066,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4400992035865784,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.36859245896339415,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.037426209449768065,
"step": 40
},
{
"calibration/aurc": 0.18634576408880263,
"calibration/batch_distribution_entropy": 0.9119223904372665,
"calibration/buffer_distribution_entropy": 0.802208760829016,
"calibration/confidence_entropy": 0.4703543439728567,
"calibration/coverage@0%": 0.010027763944112675,
"calibration/coverage@1%": 0.010027763944112675,
"calibration/coverage@10%": 0.08765064263197066,
"calibration/coverage@15%": 0.27478578523672303,
"calibration/coverage@20%": 0.7562270949253943,
"calibration/coverage@25%": 0.9463833675094918,
"calibration/coverage@30%": 1.0,
"calibration/coverage@5%": 0.028304526346201446,
"calibration/ece": 0.13231561302728326,
"calibration/mean_confidence": 0.6588306208595136,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.019357638888888907,
"completions/max_length": 3533.2,
"completions/max_terminated_length": 3533.2,
"completions/mean_length": 666.63681640625,
"completions/mean_terminated_length": 679.785791015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.4,
"epoch": 0.1079986500168748,
"grad_norm": 0.0027716143522411585,
"learning_rate": 4.909638554216868e-06,
"loss": -0.0528,
"num_tokens": 84915411.0,
"reward": 0.9768375635147095,
"reward_std": 0.16755106151103974,
"rewards/accuracy_reward": 0.6585069417953491,
"rewards/batch_coverage_0": 0.20247699320316315,
"rewards/batch_coverage_1": 0.20247699320316315,
"rewards/batch_coverage_10": 0.30868937373161315,
"rewards/batch_coverage_15": 0.3283854365348816,
"rewards/batch_coverage_20": 0.34293708205223083,
"rewards/batch_coverage_25": 0.3498722016811371,
"rewards/batch_coverage_5": 0.2585438132286072,
"rewards/brier_reward": 0.7502263784408569,
"rewards/confidence_uniqueness_reward": 0.9214388489723205,
"rewards/format_reward": 0.9802951455116272,
"rewards/frontier_entropy_batch_reward": -0.382354199886322,
"signal/accuracy_reward/centered_abs_mean": 0.18536241352558136,
"signal/accuracy_reward/group_std_mean": 0.24444203674793244,
"signal/accuracy_reward/group_zero_std_frac": 0.30555556416511537,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9499078631401062,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09268120676279068,
"signal/advantage_abs_mean": 0.7305899024009704,
"signal/advantage_pre_scale_abs_mean": 0.12448228895664215,
"signal/advantage_pre_scale_std": 0.19205167889595032,
"signal/advantage_std": 0.9834497570991516,
"signal/batch_coverage_0/centered_abs_mean": 0.18731668293476106,
"signal/batch_coverage_0/group_std_mean": 0.25060613453388214,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.027264073491096497,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0026786285918205975,
"signal/batch_coverage_1/centered_abs_mean": 0.18731668293476106,
"signal/batch_coverage_1/group_std_mean": 0.25060613453388214,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.027264073491096497,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0026786285918205975,
"signal/batch_coverage_10/centered_abs_mean": 0.22031475007534027,
"signal/batch_coverage_10/group_std_mean": 0.291497939825058,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03219587206840515,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0031505009159445764,
"signal/batch_coverage_15/centered_abs_mean": 0.22723614275455475,
"signal/batch_coverage_15/group_std_mean": 0.29923227429389954,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.033339572697877885,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0032494768034666776,
"signal/batch_coverage_20/centered_abs_mean": 0.23478021323680878,
"signal/batch_coverage_20/group_std_mean": 0.30784116983413695,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.0344635047018528,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0033573569264262913,
"signal/batch_coverage_25/centered_abs_mean": 0.24096759259700776,
"signal/batch_coverage_25/group_std_mean": 0.3143567681312561,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03543527238070965,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0034458366688340904,
"signal/batch_coverage_5/centered_abs_mean": 0.2049874782562256,
"signal/batch_coverage_5/group_std_mean": 0.2722942590713501,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.029881338775157928,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0029313208535313605,
"signal/brier_reward/centered_abs_mean": 0.19711728096008302,
"signal/brier_reward/group_std_mean": 0.24602427780628205,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20185837149620056,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.019711730256676673,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04846069440245628,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07283977195620536,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05067523345351219,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004846069682389498,
"signal/format_reward/centered_abs_mean": 0.03095160573720932,
"signal/format_reward/group_std_mean": 0.052476833760738376,
"signal/format_reward/group_zero_std_frac": 0.8027777910232544,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.16123647689819337,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01547580286860466,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.39955584406852723,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4637088716030121,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41296189427375796,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.039955586194992065,
"step": 45
},
{
"calibration/aurc": 0.3653812543339875,
"calibration/batch_distribution_entropy": 0.9548606722830435,
"calibration/buffer_distribution_entropy": 0.8285173590046464,
"calibration/confidence_entropy": 0.4608581055976243,
"calibration/coverage@0%": 0.006972364490113031,
"calibration/coverage@1%": 0.006972364490113031,
"calibration/coverage@10%": 0.006972364490113031,
"calibration/coverage@15%": 0.006972364490113031,
"calibration/coverage@20%": 0.06575440902969278,
"calibration/coverage@25%": 0.2739157656296435,
"calibration/coverage@30%": 0.3610937358143598,
"calibration/coverage@5%": 0.006972364490113031,
"calibration/ece": 0.19754022596238457,
"calibration/mean_confidence": 0.5781760003123844,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.020138888888888883,
"completions/max_length": 3450.4,
"completions/max_terminated_length": 3450.4,
"completions/mean_length": 702.4233520507812,
"completions/mean_terminated_length": 716.8848022460937,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.4,
"epoch": 0.11999850001874976,
"grad_norm": 0.0027710706926882267,
"learning_rate": 4.759036144578314e-06,
"loss": -0.0563,
"num_tokens": 96104928.0,
"reward": 0.9751698017120362,
"reward_std": 0.1612878292798996,
"rewards/accuracy_reward": 0.6449652671813965,
"rewards/batch_coverage_0": 0.22631794810295106,
"rewards/batch_coverage_1": 0.22631794810295106,
"rewards/batch_coverage_10": 0.3063993453979492,
"rewards/batch_coverage_15": 0.32116289138793946,
"rewards/batch_coverage_20": 0.335969477891922,
"rewards/batch_coverage_25": 0.3402317225933075,
"rewards/batch_coverage_5": 0.2677540272474289,
"rewards/brier_reward": 0.7310540556907654,
"rewards/confidence_uniqueness_reward": 0.9254271507263183,
"rewards/format_reward": 0.9791666626930237,
"rewards/frontier_entropy_batch_reward": -0.31489698886871337,
"signal/accuracy_reward/centered_abs_mean": 0.1730685740709305,
"signal/accuracy_reward/group_std_mean": 0.22485636174678802,
"signal/accuracy_reward/group_zero_std_frac": 0.37500000596046446,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9425517797470093,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08653428703546524,
"signal/advantage_abs_mean": 0.7427342653274536,
"signal/advantage_pre_scale_abs_mean": 0.12028573453426361,
"signal/advantage_pre_scale_std": 0.18794769048690796,
"signal/advantage_std": 0.9833948612213135,
"signal/batch_coverage_0/centered_abs_mean": 0.20805572867393493,
"signal/batch_coverage_0/group_std_mean": 0.2680402934551239,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.032111577689647675,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0029751969035714866,
"signal/batch_coverage_1/centered_abs_mean": 0.20805572867393493,
"signal/batch_coverage_1/group_std_mean": 0.2680402934551239,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.032111577689647675,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0029751969035714866,
"signal/batch_coverage_10/centered_abs_mean": 0.23025363981723784,
"signal/batch_coverage_10/group_std_mean": 0.29348798990249636,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.035579219460487366,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.003292627027258277,
"signal/batch_coverage_15/centered_abs_mean": 0.23427854776382445,
"signal/batch_coverage_15/group_std_mean": 0.29840022325515747,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03621581345796585,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.003350183181464672,
"signal/batch_coverage_20/centered_abs_mean": 0.24115452468395232,
"signal/batch_coverage_20/group_std_mean": 0.3071499466896057,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03732352778315544,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0034485096111893654,
"signal/batch_coverage_25/centered_abs_mean": 0.24156874120235444,
"signal/batch_coverage_25/group_std_mean": 0.30771451592445376,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03742283582687378,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0034544329158961774,
"signal/batch_coverage_5/centered_abs_mean": 0.2209177404642105,
"signal/batch_coverage_5/group_std_mean": 0.2820799022912979,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.034112075716257094,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0031591235660016538,
"signal/brier_reward/centered_abs_mean": 0.20708931982517242,
"signal/brier_reward/group_std_mean": 0.2561024874448776,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.22439938485622407,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.020708932355046273,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04774502441287041,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07586067467927933,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05263256207108498,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004774502851068974,
"signal/format_reward/centered_abs_mean": 0.03421224020421505,
"signal/format_reward/group_std_mean": 0.060324309021234514,
"signal/format_reward/group_zero_std_frac": 0.7666666626930236,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.18888653069734573,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.017106120102107526,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3588499128818512,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.42804943919181826,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.3920546770095825,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03588499128818512,
"step": 50
},
{
"epoch": 0.11999850001874976,
"eval_calibration/aurc": 0.2501744326434537,
"eval_calibration/batch_distribution_entropy": 0.8360048238344385,
"eval_calibration/buffer_distribution_entropy": 0.8410969664986397,
"eval_calibration/confidence_entropy": 0.5071609062611472,
"eval_calibration/coverage@0%": 0.1490255376344086,
"eval_calibration/coverage@1%": 0.1490255376344086,
"eval_calibration/coverage@10%": 0.20732526881720428,
"eval_calibration/coverage@15%": 0.36710349462365593,
"eval_calibration/coverage@20%": 0.44690860215053757,
"eval_calibration/coverage@25%": 0.5913978494623656,
"eval_calibration/coverage@30%": 0.7911626344086021,
"eval_calibration/coverage@5%": 0.1490255376344086,
"eval_calibration/ece": 0.2197660028658773,
"eval_calibration/mean_confidence": 0.6813664740767263,
"eval_completions/clipped_ratio": 0.02760416666666669,
"eval_completions/max_length": 2184.3333333333335,
"eval_completions/max_terminated_length": 2184.3333333333335,
"eval_completions/mean_length": 685.631103515625,
"eval_completions/mean_terminated_length": 704.9729817708334,
"eval_completions/min_length": 0.0,
"eval_completions/min_terminated_length": 257.1666666666667,
"eval_loss": 0.0,
"eval_num_tokens": 96104928.0,
"eval_reward": 0.8819248378276825,
"eval_reward_std": 0.281004066268603,
"eval_rewards/accuracy_reward": 0.6545138855775198,
"eval_rewards/batch_coverage_0": 0.005231775964299838,
"eval_rewards/batch_coverage_1": 0.005231775964299838,
"eval_rewards/batch_coverage_10": 0.015199587369958559,
"eval_rewards/batch_coverage_15": 0.03218171435097853,
"eval_rewards/batch_coverage_20": 0.08130176406727212,
"eval_rewards/batch_coverage_25": 0.11032332324733336,
"eval_rewards/batch_coverage_5": 0.005231775964299838,
"eval_rewards/brier_reward": 0.7671490510304769,
"eval_rewards/confidence_uniqueness_reward": 0.8576907813549042,
"eval_rewards/format_reward": 0.9713541766007742,
"eval_rewards/frontier_entropy_batch_reward": -0.9713541766007742,
"eval_runtime": 212.1586,
"eval_samples_per_second": 4.713,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4370659738779068,
"eval_signal/accuracy_reward/group_std_mean": 0.473753089706103,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.7891835272312164,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2185329869389534,
"eval_signal/advantage_abs_mean": 0.8707231283187866,
"eval_signal/advantage_pre_scale_abs_mean": 0.24482331921656927,
"eval_signal/advantage_pre_scale_std": 0.27929239471753436,
"eval_signal/advantage_std": 0.9864674607912699,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.11891523251930873,
"eval_signal/batch_coverage_0/group_std_mean": 0.18859038750330606,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.00614167214371264,
"eval_signal/batch_coverage_0/weight": 0.014299999922513962,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.0017004877833339076,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.11891523251930873,
"eval_signal/batch_coverage_1/group_std_mean": 0.18859038750330606,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.00614167214371264,
"eval_signal/batch_coverage_1/weight": 0.014299999922513962,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.0017004877833339076,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.10200798759857814,
"eval_signal/batch_coverage_10/group_std_mean": 0.15644440179069838,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.005255104896302025,
"eval_signal/batch_coverage_10/weight": 0.014299999922513962,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.0014587142504751682,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.1067725345492363,
"eval_signal/batch_coverage_15/group_std_mean": 0.15877163410186768,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.005520270516475041,
"eval_signal/batch_coverage_15/weight": 0.014299999922513962,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.001526847171286742,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.1338308664659659,
"eval_signal/batch_coverage_20/group_std_mean": 0.18896765261888504,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.006970916486655672,
"eval_signal/batch_coverage_20/weight": 0.014299999922513962,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.00191378133604303,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.15927699456612268,
"eval_signal/batch_coverage_25/group_std_mean": 0.2141042004028956,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.008288138701270023,
"eval_signal/batch_coverage_25/weight": 0.014299999922513962,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.002277661037320892,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.11891523251930873,
"eval_signal/batch_coverage_5/group_std_mean": 0.18859038750330606,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.00614167214371264,
"eval_signal/batch_coverage_5/weight": 0.014299999922513962,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.0017004877833339076,
"eval_signal/brier_reward/centered_abs_mean": 0.2128459538022677,
"eval_signal/brier_reward/group_std_mean": 0.27014947930971783,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07686047628521919,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.02128459544231494,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.07862282978991668,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.14259813353419304,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.028277449930707615,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.00786228283929328,
"eval_signal/format_reward/centered_abs_mean": 0.054307724349200726,
"eval_signal/format_reward/group_std_mean": 0.1340879499912262,
"eval_signal/format_reward/group_zero_std_frac": 0.3333333432674408,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.09734798781573772,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.027153862174600363,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.054307724349200726,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.1340879499912262,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.3333333432674408,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.019469597997764748,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.005430772861776252,
"eval_steps_per_second": 0.028,
"step": 50
},
{
"calibration/aurc": 0.2666057163044536,
"calibration/batch_distribution_entropy": 0.9119153030575751,
"calibration/buffer_distribution_entropy": 0.8477103549632282,
"calibration/confidence_entropy": 0.5351193885844949,
"calibration/coverage@0%": 0.008132351368401148,
"calibration/coverage@1%": 0.008132351368401148,
"calibration/coverage@10%": 0.08364484499410232,
"calibration/coverage@15%": 0.17373565065897462,
"calibration/coverage@20%": 0.303553484435861,
"calibration/coverage@25%": 0.5082058493847523,
"calibration/coverage@30%": 0.5936117061911805,
"calibration/coverage@5%": 0.008132351368401148,
"calibration/ece": 0.13232012055932668,
"calibration/mean_confidence": 0.6377276248857202,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.030729166666666696,
"completions/max_length": 3737.2,
"completions/max_terminated_length": 3737.2,
"completions/mean_length": 740.2117309570312,
"completions/mean_terminated_length": 764.018017578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.4,
"epoch": 0.13199835002062474,
"grad_norm": 0.002537653548642993,
"learning_rate": 4.60843373493976e-06,
"loss": -0.0787,
"num_tokens": 107712743.0,
"reward": 0.9625433802604675,
"reward_std": 0.1768051266670227,
"rewards/accuracy_reward": 0.6413194537162781,
"rewards/batch_coverage_0": 0.19119429290294648,
"rewards/batch_coverage_1": 0.19119429290294648,
"rewards/batch_coverage_10": 0.27686830759048464,
"rewards/batch_coverage_15": 0.29585487246513364,
"rewards/batch_coverage_20": 0.3131400167942047,
"rewards/batch_coverage_25": 0.31867790818214414,
"rewards/batch_coverage_5": 0.23185440599918367,
"rewards/brier_reward": 0.7527095675468445,
"rewards/confidence_uniqueness_reward": 0.9140000581741333,
"rewards/format_reward": 0.9682291626930237,
"rewards/frontier_entropy_batch_reward": -0.3491052746772766,
"signal/accuracy_reward/centered_abs_mean": 0.17724609076976777,
"signal/accuracy_reward/group_std_mean": 0.23526398837566376,
"signal/accuracy_reward/group_zero_std_frac": 0.33333333730697634,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8493547916412354,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08862304538488389,
"signal/advantage_abs_mean": 0.7360433101654053,
"signal/advantage_pre_scale_abs_mean": 0.13109518587589264,
"signal/advantage_pre_scale_std": 0.2053077608346939,
"signal/advantage_std": 0.9835181951522827,
"signal/batch_coverage_0/centered_abs_mean": 0.15323663353919983,
"signal/batch_coverage_0/group_std_mean": 0.19757155179977418,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.020985569804906845,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00219128392636776,
"signal/batch_coverage_1/centered_abs_mean": 0.15323663353919983,
"signal/batch_coverage_1/group_std_mean": 0.19757155179977418,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.020985569804906845,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.00219128392636776,
"signal/batch_coverage_10/centered_abs_mean": 0.18423721194267273,
"signal/batch_coverage_10/group_std_mean": 0.23655245900154115,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.025256452709436418,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0026345920283347367,
"signal/batch_coverage_15/centered_abs_mean": 0.19146005511283876,
"signal/batch_coverage_15/group_std_mean": 0.24579738676548005,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.026244521141052246,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.00273787877522409,
"signal/batch_coverage_20/centered_abs_mean": 0.20497694611549377,
"signal/batch_coverage_20/group_std_mean": 0.2633979916572571,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.028137539327144623,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002931170305237174,
"signal/batch_coverage_25/centered_abs_mean": 0.20725022852420807,
"signal/batch_coverage_25/group_std_mean": 0.266075587272644,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.028393303230404853,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0029636782128363848,
"signal/batch_coverage_5/centered_abs_mean": 0.16661872565746308,
"signal/batch_coverage_5/group_std_mean": 0.21373461782932282,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.022796943411231042,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023826478514820336,
"signal/brier_reward/centered_abs_mean": 0.16861002445220946,
"signal/brier_reward/group_std_mean": 0.21469865143299102,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16197701394557953,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016861002892255783,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.05775104984641075,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09017271548509598,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.054496601969003675,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005775105208158493,
"signal/format_reward/centered_abs_mean": 0.04748263955116272,
"signal/format_reward/group_std_mean": 0.07838348224759102,
"signal/format_reward/group_zero_std_frac": 0.7166666746139526,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.22152430117130278,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.02374131977558136,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3691326975822449,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4334746658802032,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.35583109855651857,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.036913270503282546,
"step": 55
},
{
"calibration/aurc": 0.2805250612184601,
"calibration/batch_distribution_entropy": 0.9082584323337837,
"calibration/buffer_distribution_entropy": 0.8582812627318781,
"calibration/confidence_entropy": 0.46065529544521266,
"calibration/coverage@0%": 0.012173174872665535,
"calibration/coverage@1%": 0.012173174872665535,
"calibration/coverage@10%": 0.12241878890775326,
"calibration/coverage@15%": 0.3207108092812677,
"calibration/coverage@20%": 0.45481607243916244,
"calibration/coverage@25%": 0.5905705952225476,
"calibration/coverage@30%": 0.6680951759050231,
"calibration/coverage@5%": 0.01638370118845501,
"calibration/ece": 0.15464922341745774,
"calibration/mean_confidence": 0.6475694502184807,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04088541666666665,
"completions/max_length": 3870.8,
"completions/max_terminated_length": 3870.8,
"completions/mean_length": 796.7504272460938,
"completions/mean_terminated_length": 831.1199584960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.14399820002249972,
"grad_norm": 0.002131366403773427,
"learning_rate": 4.457831325301205e-06,
"loss": -0.1066,
"num_tokens": 119987884.0,
"reward": 0.9501850128173828,
"reward_std": 0.19018920361995698,
"rewards/accuracy_reward": 0.6277777791023255,
"rewards/batch_coverage_0": 0.22274880409240722,
"rewards/batch_coverage_1": 0.22274880409240722,
"rewards/batch_coverage_10": 0.3174858093261719,
"rewards/batch_coverage_15": 0.3349512219429016,
"rewards/batch_coverage_20": 0.3502999782562256,
"rewards/batch_coverage_25": 0.3587311267852783,
"rewards/batch_coverage_5": 0.273887038230896,
"rewards/brier_reward": 0.7444848537445068,
"rewards/confidence_uniqueness_reward": 0.9000389218330384,
"rewards/format_reward": 0.9586805701255798,
"rewards/frontier_entropy_batch_reward": -0.3725272506475449,
"signal/accuracy_reward/centered_abs_mean": 0.18524305522441864,
"signal/accuracy_reward/group_std_mean": 0.24164635241031646,
"signal/accuracy_reward/group_zero_std_frac": 0.32500000298023224,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9821744084358215,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09262152761220932,
"signal/advantage_abs_mean": 0.720962119102478,
"signal/advantage_pre_scale_abs_mean": 0.1397200971841812,
"signal/advantage_pre_scale_std": 0.22300408482551576,
"signal/advantage_std": 0.9834036588668823,
"signal/batch_coverage_0/centered_abs_mean": 0.17097190916538238,
"signal/batch_coverage_0/group_std_mean": 0.2232629120349884,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.02595260217785835,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0024448982439935207,
"signal/batch_coverage_1/centered_abs_mean": 0.17097190916538238,
"signal/batch_coverage_1/group_std_mean": 0.2232629120349884,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.02595260217785835,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0024448982439935207,
"signal/batch_coverage_10/centered_abs_mean": 0.20441849529743195,
"signal/batch_coverage_10/group_std_mean": 0.2626152366399765,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.031098613888025282,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002923184493556619,
"signal/batch_coverage_15/centered_abs_mean": 0.21226610839366913,
"signal/batch_coverage_15/group_std_mean": 0.2717501163482666,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03228338472545147,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0030354054179042578,
"signal/batch_coverage_20/centered_abs_mean": 0.21970059275627135,
"signal/batch_coverage_20/group_std_mean": 0.2806521713733673,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.033462189882993695,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.003141718450933695,
"signal/batch_coverage_25/centered_abs_mean": 0.22630649209022521,
"signal/batch_coverage_25/group_std_mean": 0.2887511670589447,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03441586978733539,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.003236182779073715,
"signal/batch_coverage_5/centered_abs_mean": 0.18746646940708162,
"signal/batch_coverage_5/group_std_mean": 0.24161474406719208,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.02843271866440773,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0026807705871760845,
"signal/brier_reward/centered_abs_mean": 0.1914803385734558,
"signal/brier_reward/group_std_mean": 0.23936934769153595,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.2029225081205368,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01914803497493267,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07778193056583405,
"signal/confidence_uniqueness_reward/group_std_mean": 0.12012100070714951,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.08290282860398293,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007778193149715662,
"signal/format_reward/centered_abs_mean": 0.0645724818110466,
"signal/format_reward/group_std_mean": 0.10592493116855621,
"signal/format_reward/group_zero_std_frac": 0.6166666626930237,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.33684843182563784,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0322862409055233,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3714078009128571,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.43968183994293214,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.40729886293411255,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03714078143239021,
"step": 60
},
{
"calibration/aurc": 0.21271399086090503,
"calibration/batch_distribution_entropy": 0.968225044162411,
"calibration/buffer_distribution_entropy": 0.873241567117009,
"calibration/confidence_entropy": 0.46644094116885915,
"calibration/coverage@0%": 0.015098270514390218,
"calibration/coverage@1%": 0.015098270514390218,
"calibration/coverage@10%": 0.2748608190038695,
"calibration/coverage@15%": 0.5670735095002177,
"calibration/coverage@20%": 0.6353687146386167,
"calibration/coverage@25%": 0.7044765332732366,
"calibration/coverage@30%": 0.752890016419304,
"calibration/coverage@5%": 0.09382167476970939,
"calibration/ece": 0.16446162124530828,
"calibration/mean_confidence": 0.5560946029883173,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05295138888888891,
"completions/max_length": 3882.2,
"completions/max_terminated_length": 3882.2,
"completions/mean_length": 815.523095703125,
"completions/mean_terminated_length": 860.9260864257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.2,
"epoch": 0.1559980500243747,
"grad_norm": 0.0024373617488890886,
"learning_rate": 4.307228915662651e-06,
"loss": -0.1167,
"num_tokens": 132476758.0,
"reward": 0.9634868264198303,
"reward_std": 0.1869693487882614,
"rewards/accuracy_reward": 0.6431423544883728,
"rewards/batch_coverage_0": 0.23204412460327148,
"rewards/batch_coverage_1": 0.23204412460327148,
"rewards/batch_coverage_10": 0.32365984916687013,
"rewards/batch_coverage_15": 0.3395088195800781,
"rewards/batch_coverage_20": 0.35158597230911254,
"rewards/batch_coverage_25": 0.35531420111656187,
"rewards/batch_coverage_5": 0.28531578183174133,
"rewards/brier_reward": 0.7312113642692566,
"rewards/confidence_uniqueness_reward": 0.8997768282890319,
"rewards/format_reward": 0.946875,
"rewards/frontier_entropy_batch_reward": -0.24929134249687196,
"signal/accuracy_reward/centered_abs_mean": 0.16935221254825591,
"signal/accuracy_reward/group_std_mean": 0.22745457291603088,
"signal/accuracy_reward/group_zero_std_frac": 0.33611111640930175,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8086487889289856,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08467610627412796,
"signal/advantage_abs_mean": 0.7164683222770691,
"signal/advantage_pre_scale_abs_mean": 0.13654183447360993,
"signal/advantage_pre_scale_std": 0.21922028064727783,
"signal/advantage_std": 0.9835149884223938,
"signal/batch_coverage_0/centered_abs_mean": 0.20915255844593048,
"signal/batch_coverage_0/group_std_mean": 0.2628923296928406,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.028789347037672997,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002990881586447358,
"signal/batch_coverage_1/centered_abs_mean": 0.20915255844593048,
"signal/batch_coverage_1/group_std_mean": 0.2628923296928406,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.028789347037672997,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002990881586447358,
"signal/batch_coverage_10/centered_abs_mean": 0.23952043652534485,
"signal/batch_coverage_10/group_std_mean": 0.2989070534706116,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0329642117023468,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0034251420758664606,
"signal/batch_coverage_15/centered_abs_mean": 0.24125888645648957,
"signal/batch_coverage_15/group_std_mean": 0.30165609121322634,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03321785070002079,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.003450001962482929,
"signal/batch_coverage_20/centered_abs_mean": 0.2430189371109009,
"signal/batch_coverage_20/group_std_mean": 0.30439882874488833,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03352968730032444,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.003475170908495784,
"signal/batch_coverage_25/centered_abs_mean": 0.24278749525547028,
"signal/batch_coverage_25/group_std_mean": 0.3051342725753784,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03350915126502514,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.003471861220896244,
"signal/batch_coverage_5/centered_abs_mean": 0.22716282606124877,
"signal/batch_coverage_5/group_std_mean": 0.28373380899429324,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03121436983346939,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.003248428227379918,
"signal/brier_reward/centered_abs_mean": 0.21119480431079865,
"signal/brier_reward/group_std_mean": 0.2605059534311295,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20226334631443024,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.021119481325149535,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08284200727939606,
"signal/confidence_uniqueness_reward/group_std_mean": 0.12240892052650451,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07839244678616523,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008284200355410576,
"signal/format_reward/centered_abs_mean": 0.07634548544883728,
"signal/format_reward/group_std_mean": 0.1152060568332672,
"signal/format_reward/group_zero_std_frac": 0.6111111164093017,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.3589187800884247,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.03817274272441864,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32150029540061953,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39567927122116087,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.3117268800735474,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03215003050863743,
"step": 65
},
{
"calibration/aurc": 0.24774531330265148,
"calibration/batch_distribution_entropy": 0.9211471777051056,
"calibration/buffer_distribution_entropy": 0.8814297626198936,
"calibration/confidence_entropy": 0.4404385253711066,
"calibration/coverage@0%": 0.01250161932596551,
"calibration/coverage@1%": 0.01250161932596551,
"calibration/coverage@10%": 0.13770487135848583,
"calibration/coverage@15%": 0.15402542453616247,
"calibration/coverage@20%": 0.28338366903393925,
"calibration/coverage@25%": 0.5766635713041601,
"calibration/coverage@30%": 0.8610831903159772,
"calibration/coverage@5%": 0.08025229683274057,
"calibration/ece": 0.14467948400989514,
"calibration/mean_confidence": 0.6279437049412072,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03663194444444444,
"completions/max_length": 3646.8,
"completions/max_terminated_length": 3646.8,
"completions/mean_length": 834.01416015625,
"completions/mean_terminated_length": 865.7296020507813,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.16799790002624967,
"grad_norm": 0.0021353804040700197,
"learning_rate": 4.156626506024097e-06,
"loss": -0.0899,
"num_tokens": 145162745.0,
"reward": 0.9690884709358215,
"reward_std": 0.17554756104946137,
"rewards/accuracy_reward": 0.6421006917953491,
"rewards/batch_coverage_0": 0.26493396162986754,
"rewards/batch_coverage_1": 0.26493396162986754,
"rewards/batch_coverage_10": 0.35353497266769407,
"rewards/batch_coverage_15": 0.36810287833213806,
"rewards/batch_coverage_20": 0.3775683999061584,
"rewards/batch_coverage_25": 0.38541075587272644,
"rewards/batch_coverage_5": 0.3158003121614456,
"rewards/brier_reward": 0.7648000240325927,
"rewards/confidence_uniqueness_reward": 0.9073068022727966,
"rewards/format_reward": 0.9630208253860474,
"rewards/frontier_entropy_batch_reward": -0.3400608658790588,
"signal/accuracy_reward/centered_abs_mean": 0.1756781667470932,
"signal/accuracy_reward/group_std_mean": 0.22818847894668579,
"signal/accuracy_reward/group_zero_std_frac": 0.3583333373069763,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9614694237709045,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0878390833735466,
"signal/advantage_abs_mean": 0.7173182845115662,
"signal/advantage_pre_scale_abs_mean": 0.12758750170469285,
"signal/advantage_pre_scale_std": 0.2087983638048172,
"signal/advantage_std": 0.9833806037902832,
"signal/batch_coverage_0/centered_abs_mean": 0.18425578474998475,
"signal/batch_coverage_0/group_std_mean": 0.23661755323410033,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.02887213006615639,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002634857688099146,
"signal/batch_coverage_1/centered_abs_mean": 0.18425578474998475,
"signal/batch_coverage_1/group_std_mean": 0.23661755323410033,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.02887213006615639,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002634857688099146,
"signal/batch_coverage_10/centered_abs_mean": 0.20975691378116607,
"signal/batch_coverage_10/group_std_mean": 0.2687097519636154,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0329612348228693,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0029995237942785024,
"signal/batch_coverage_15/centered_abs_mean": 0.2140837788581848,
"signal/batch_coverage_15/group_std_mean": 0.27389532923698423,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03371351547539234,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.00306139811873436,
"signal/batch_coverage_20/centered_abs_mean": 0.21710529327392578,
"signal/batch_coverage_20/group_std_mean": 0.27775133550167086,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03412764519453049,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.003104605805128813,
"signal/batch_coverage_25/centered_abs_mean": 0.2247564196586609,
"signal/batch_coverage_25/group_std_mean": 0.2869294822216034,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03532760851085186,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0032140168827027083,
"signal/batch_coverage_5/centered_abs_mean": 0.19823941290378572,
"signal/batch_coverage_5/group_std_mean": 0.25322408974170685,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.031146563962101935,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002834823727607727,
"signal/brier_reward/centered_abs_mean": 0.17921520471572877,
"signal/brier_reward/group_std_mean": 0.2279728651046753,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19661661088466645,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01792152039706707,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06926488056778908,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10625757575035095,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07679094523191451,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006926488224416971,
"signal/format_reward/centered_abs_mean": 0.05738932266831398,
"signal/format_reward/group_std_mean": 0.09339729994535446,
"signal/format_reward/group_zero_std_frac": 0.6638888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.316864949464798,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.02869466133415699,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.35653844475746155,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.426321929693222,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.39354650378227235,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.035653845965862276,
"step": 70
},
{
"calibration/aurc": 0.1860572061793848,
"calibration/batch_distribution_entropy": 0.9246497199562427,
"calibration/buffer_distribution_entropy": 0.8883004335264812,
"calibration/confidence_entropy": 0.46967255991625756,
"calibration/coverage@0%": 0.05187684511478731,
"calibration/coverage@1%": 0.08415197738991957,
"calibration/coverage@10%": 0.3801901627673189,
"calibration/coverage@15%": 0.4447886261393833,
"calibration/coverage@20%": 0.5134962946195415,
"calibration/coverage@25%": 0.6749903386901988,
"calibration/coverage@30%": 0.7981028883421204,
"calibration/coverage@5%": 0.19222920355225212,
"calibration/ece": 0.13536141780022604,
"calibration/mean_confidence": 0.6490086093658063,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017534722222222254,
"completions/max_length": 3834.0,
"completions/max_terminated_length": 3834.0,
"completions/mean_length": 795.0524291992188,
"completions/mean_terminated_length": 809.184228515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 244.4,
"epoch": 0.17999775002812465,
"grad_norm": 0.002331721130758524,
"learning_rate": 4.006024096385543e-06,
"loss": -0.0434,
"num_tokens": 157386645.0,
"reward": 1.0091155409812926,
"reward_std": 0.1520775854587555,
"rewards/accuracy_reward": 0.690625011920929,
"rewards/batch_coverage_0": 0.2843721121549606,
"rewards/batch_coverage_1": 0.2843721121549606,
"rewards/batch_coverage_10": 0.37667989134788515,
"rewards/batch_coverage_15": 0.3934926211833954,
"rewards/batch_coverage_20": 0.4086416959762573,
"rewards/batch_coverage_25": 0.41325275897979735,
"rewards/batch_coverage_5": 0.3411641776561737,
"rewards/brier_reward": 0.8053735375404358,
"rewards/confidence_uniqueness_reward": 0.9250438809394836,
"rewards/format_reward": 0.9811631917953492,
"rewards/frontier_entropy_batch_reward": -0.3559852600097656,
"signal/accuracy_reward/centered_abs_mean": 0.16431206464767456,
"signal/accuracy_reward/group_std_mean": 0.21887060701847078,
"signal/accuracy_reward/group_zero_std_frac": 0.37222222089767454,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9913081049919128,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08215603232383728,
"signal/advantage_abs_mean": 0.7215629577636719,
"signal/advantage_pre_scale_abs_mean": 0.10871622711420059,
"signal/advantage_pre_scale_std": 0.18051161766052246,
"signal/advantage_std": 0.9832531571388244,
"signal/batch_coverage_0/centered_abs_mean": 0.16481218934059144,
"signal/batch_coverage_0/group_std_mean": 0.21140751242637634,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.028905288875102998,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002356814406812191,
"signal/batch_coverage_1/centered_abs_mean": 0.16481218934059144,
"signal/batch_coverage_1/group_std_mean": 0.21140751242637634,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.028905288875102998,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002356814406812191,
"signal/batch_coverage_10/centered_abs_mean": 0.19273822903633117,
"signal/batch_coverage_10/group_std_mean": 0.24738204181194307,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03368869572877884,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0027561566326767205,
"signal/batch_coverage_15/centered_abs_mean": 0.2002232104539871,
"signal/batch_coverage_15/group_std_mean": 0.257281693816185,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.0350172333419323,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0028631918597966433,
"signal/batch_coverage_20/centered_abs_mean": 0.20743354558944702,
"signal/batch_coverage_20/group_std_mean": 0.26647299230098725,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03612685203552246,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0029662997461855412,
"signal/batch_coverage_25/centered_abs_mean": 0.21094645261764527,
"signal/batch_coverage_25/group_std_mean": 0.2706577003002167,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.0366845540702343,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.003016534401103854,
"signal/batch_coverage_5/centered_abs_mean": 0.1806482791900635,
"signal/batch_coverage_5/group_std_mean": 0.23169384598731996,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03167664371430874,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.00258327042683959,
"signal/brier_reward/centered_abs_mean": 0.14588031470775603,
"signal/brier_reward/group_std_mean": 0.18990064263343812,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17611936926841737,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.014588031731545926,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.044456960260868074,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07398699522018433,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05278183743357658,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004445695923641324,
"signal/format_reward/centered_abs_mean": 0.03200412318110466,
"signal/format_reward/group_std_mean": 0.05958279184997082,
"signal/format_reward/group_zero_std_frac": 0.7555555701255798,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.18332911729812623,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01600206159055233,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34321765303611756,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4119159996509552,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.42253615260124205,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03432176485657692,
"step": 75
},
{
"calibration/aurc": 0.15278728997151786,
"calibration/batch_distribution_entropy": 0.9201143720375663,
"calibration/buffer_distribution_entropy": 0.8935675122889043,
"calibration/confidence_entropy": 0.4637714482116243,
"calibration/coverage@0%": 0.047810804511804375,
"calibration/coverage@1%": 0.047810804511804375,
"calibration/coverage@10%": 0.39640807567155434,
"calibration/coverage@15%": 0.6296719106720096,
"calibration/coverage@20%": 0.7230256765216778,
"calibration/coverage@25%": 0.7885582527268078,
"calibration/coverage@30%": 0.9187234042553192,
"calibration/coverage@5%": 0.11938975188022542,
"calibration/ece": 0.14607550692186017,
"calibration/mean_confidence": 0.6258542466649399,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012152777777777768,
"completions/max_length": 3363.8,
"completions/max_terminated_length": 3363.8,
"completions/mean_length": 798.2267333984375,
"completions/mean_terminated_length": 808.1140991210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.8,
"epoch": 0.19199760002999963,
"grad_norm": 0.002594166435301304,
"learning_rate": 3.855421686746989e-06,
"loss": -0.0318,
"num_tokens": 169635497.0,
"reward": 0.9984848141670227,
"reward_std": 0.13936654329299927,
"rewards/accuracy_reward": 0.6553819417953491,
"rewards/batch_coverage_0": 0.2774128675460815,
"rewards/batch_coverage_1": 0.2774128675460815,
"rewards/batch_coverage_10": 0.3563702404499054,
"rewards/batch_coverage_15": 0.36783955097198484,
"rewards/batch_coverage_20": 0.37556777000427244,
"rewards/batch_coverage_25": 0.38054537773132324,
"rewards/batch_coverage_5": 0.3182225406169891,
"rewards/brier_reward": 0.7844858169555664,
"rewards/confidence_uniqueness_reward": 0.9357450246810913,
"rewards/format_reward": 0.9878472208976745,
"rewards/frontier_entropy_batch_reward": -0.2880606114864349,
"signal/accuracy_reward/centered_abs_mean": 0.1596137136220932,
"signal/accuracy_reward/group_std_mean": 0.21180415749549866,
"signal/accuracy_reward/group_zero_std_frac": 0.397222226858139,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.999223780632019,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0798068568110466,
"signal/advantage_abs_mean": 0.7337562799453735,
"signal/advantage_pre_scale_abs_mean": 0.10226666629314422,
"signal/advantage_pre_scale_std": 0.1641099452972412,
"signal/advantage_std": 0.9832229495048523,
"signal/batch_coverage_0/centered_abs_mean": 0.16477072536945342,
"signal/batch_coverage_0/group_std_mean": 0.20871671438217163,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.029762042686343193,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002356221526861191,
"signal/batch_coverage_1/centered_abs_mean": 0.16477072536945342,
"signal/batch_coverage_1/group_std_mean": 0.20871671438217163,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.029762042686343193,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002356221526861191,
"signal/batch_coverage_10/centered_abs_mean": 0.19158050715923308,
"signal/batch_coverage_10/group_std_mean": 0.2435964047908783,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.034541353583335876,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002739601256325841,
"signal/batch_coverage_15/centered_abs_mean": 0.19723239839076995,
"signal/batch_coverage_15/group_std_mean": 0.2512341469526291,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.035590323805809024,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0028204232454299926,
"signal/batch_coverage_20/centered_abs_mean": 0.19890411496162413,
"signal/batch_coverage_20/group_std_mean": 0.2538034111261368,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03589809164404869,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0028443288523703814,
"signal/batch_coverage_25/centered_abs_mean": 0.19924717247486115,
"signal/batch_coverage_25/group_std_mean": 0.2548013597726822,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03595203086733818,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0028492347337305546,
"signal/batch_coverage_5/centered_abs_mean": 0.1787838190793991,
"signal/batch_coverage_5/group_std_mean": 0.22594253718852997,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03230985403060913,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002556608524173498,
"signal/brier_reward/centered_abs_mean": 0.1500013828277588,
"signal/brier_reward/group_std_mean": 0.1918517142534256,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18809423148632048,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.015000138618052005,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.03316798135638237,
"signal/confidence_uniqueness_reward/group_std_mean": 0.053910914808511734,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.042143101990222934,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0033167982008308174,
"signal/format_reward/centered_abs_mean": 0.020323350466787814,
"signal/format_reward/group_std_mean": 0.0388909000903368,
"signal/format_reward/group_zero_std_frac": 0.8361111164093018,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.1274551644921303,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.010161675233393907,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3206637859344482,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39140368103981016,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4064549207687378,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.032066379114985465,
"step": 80
},
{
"calibration/aurc": 0.18923391547567533,
"calibration/batch_distribution_entropy": 0.9520473330985391,
"calibration/buffer_distribution_entropy": 0.9007541310204402,
"calibration/confidence_entropy": 0.454584405636062,
"calibration/coverage@0%": 0.019984941894130602,
"calibration/coverage@1%": 0.019984941894130602,
"calibration/coverage@10%": 0.2399654196333001,
"calibration/coverage@15%": 0.4899862593203633,
"calibration/coverage@20%": 0.6405708575230216,
"calibration/coverage@25%": 0.76561468810309,
"calibration/coverage@30%": 0.8572303871781062,
"calibration/coverage@5%": 0.0599849418941306,
"calibration/ece": 0.1479991190866681,
"calibration/mean_confidence": 0.5452007696982675,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3134.6,
"completions/max_terminated_length": 3134.6,
"completions/mean_length": 805.275439453125,
"completions/mean_terminated_length": 811.61259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 290.2,
"epoch": 0.2039974500318746,
"grad_norm": 0.00243806722573936,
"learning_rate": 3.7048192771084342e-06,
"loss": -0.0182,
"num_tokens": 181999470.0,
"reward": 1.0227416276931762,
"reward_std": 0.13126814365386963,
"rewards/accuracy_reward": 0.693836796283722,
"rewards/batch_coverage_0": 0.30402875542640684,
"rewards/batch_coverage_1": 0.30402875542640684,
"rewards/batch_coverage_10": 0.3916003882884979,
"rewards/batch_coverage_15": 0.4001737177371979,
"rewards/batch_coverage_20": 0.41051369309425356,
"rewards/batch_coverage_25": 0.4160007297992706,
"rewards/batch_coverage_5": 0.3573697626590729,
"rewards/brier_reward": 0.8003751873970032,
"rewards/confidence_uniqueness_reward": 0.9386812210083008,
"rewards/format_reward": 0.9921006798744202,
"rewards/frontier_entropy_batch_reward": -0.3107993364334106,
"signal/accuracy_reward/centered_abs_mean": 0.15584309697151183,
"signal/accuracy_reward/group_std_mean": 0.20595036149024964,
"signal/accuracy_reward/group_zero_std_frac": 0.4111111104488373,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0481150150299072,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07792154848575591,
"signal/advantage_abs_mean": 0.7339847207069397,
"signal/advantage_pre_scale_abs_mean": 0.09553650766611099,
"signal/advantage_pre_scale_std": 0.15589506924152374,
"signal/advantage_std": 0.9831393837928772,
"signal/batch_coverage_0/centered_abs_mean": 0.1720154494047165,
"signal/batch_coverage_0/group_std_mean": 0.21767355501651764,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03309999741613865,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002459820918738842,
"signal/batch_coverage_1/centered_abs_mean": 0.1720154494047165,
"signal/batch_coverage_1/group_std_mean": 0.21767355501651764,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03309999741613865,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002459820918738842,
"signal/batch_coverage_10/centered_abs_mean": 0.19574157893657684,
"signal/batch_coverage_10/group_std_mean": 0.25071599781513215,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03761949762701988,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002799104666337371,
"signal/batch_coverage_15/centered_abs_mean": 0.19882267713546753,
"signal/batch_coverage_15/group_std_mean": 0.25518170595169065,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03819100335240364,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0028431642800569533,
"signal/batch_coverage_20/centered_abs_mean": 0.199673655629158,
"signal/batch_coverage_20/group_std_mean": 0.257070392370224,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03838499188423157,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002855333220213652,
"signal/batch_coverage_25/centered_abs_mean": 0.20092089772224425,
"signal/batch_coverage_25/group_std_mean": 0.2598212480545044,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03866703435778618,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0028731688857078553,
"signal/batch_coverage_5/centered_abs_mean": 0.18711472749710084,
"signal/batch_coverage_5/group_std_mean": 0.23794482946395873,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03597569689154625,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002675740560516715,
"signal/brier_reward/centered_abs_mean": 0.14387299716472626,
"signal/brier_reward/group_std_mean": 0.18577905595302582,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19340074360370635,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.014387300051748753,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02960009016096592,
"signal/confidence_uniqueness_reward/group_std_mean": 0.049013516306877135,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03989330157637596,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002960009081289172,
"signal/format_reward/centered_abs_mean": 0.014480251632630826,
"signal/format_reward/group_std_mean": 0.030678948760032652,
"signal/format_reward/group_zero_std_frac": 0.8638888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.09725062102079392,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007240125816315413,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.338201642036438,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4092584550380707,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4551249384880066,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033820164203643796,
"step": 85
},
{
"calibration/aurc": 0.11841251098462335,
"calibration/batch_distribution_entropy": 0.9437270099405835,
"calibration/buffer_distribution_entropy": 0.9070971011642726,
"calibration/confidence_entropy": 0.47455616824898056,
"calibration/coverage@0%": 0.04532894736842106,
"calibration/coverage@1%": 0.07501644736842104,
"calibration/coverage@10%": 0.528530701754386,
"calibration/coverage@15%": 0.7434020014581308,
"calibration/coverage@20%": 0.8073761890709623,
"calibration/coverage@25%": 0.8629222359163078,
"calibration/coverage@30%": 0.9403157981530343,
"calibration/coverage@5%": 0.43197916666666664,
"calibration/ece": 0.14672207193323844,
"calibration/mean_confidence": 0.6095787607729249,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009722222222222233,
"completions/max_length": 3009.0,
"completions/max_terminated_length": 3009.0,
"completions/mean_length": 798.0718872070313,
"completions/mean_terminated_length": 805.9511962890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 291.0,
"epoch": 0.2159973000337496,
"grad_norm": 0.0024022439029067755,
"learning_rate": 3.5542168674698798e-06,
"loss": -0.0292,
"num_tokens": 194261930.0,
"reward": 1.012345314025879,
"reward_std": 0.1382793515920639,
"rewards/accuracy_reward": 0.67734375,
"rewards/batch_coverage_0": 0.29151966571807864,
"rewards/batch_coverage_1": 0.29151966571807864,
"rewards/batch_coverage_10": 0.37335585355758666,
"rewards/batch_coverage_15": 0.38456701040267943,
"rewards/batch_coverage_20": 0.3926276683807373,
"rewards/batch_coverage_25": 0.3991153180599213,
"rewards/batch_coverage_5": 0.3377767264842987,
"rewards/brier_reward": 0.8084167242050171,
"rewards/confidence_uniqueness_reward": 0.9369056940078735,
"rewards/format_reward": 0.9902777791023254,
"rewards/frontier_entropy_batch_reward": -0.31325617134571077,
"signal/accuracy_reward/centered_abs_mean": 0.15619032382965087,
"signal/accuracy_reward/group_std_mean": 0.20633386969566345,
"signal/accuracy_reward/group_zero_std_frac": 0.4083333373069763,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9890505075454712,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07809516191482543,
"signal/advantage_abs_mean": 0.722357201576233,
"signal/advantage_pre_scale_abs_mean": 0.09861443638801574,
"signal/advantage_pre_scale_std": 0.16254321336746216,
"signal/advantage_std": 0.9832135915756226,
"signal/batch_coverage_0/centered_abs_mean": 0.15663588047027588,
"signal/batch_coverage_0/group_std_mean": 0.19704360365867615,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.028397080302238465,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002239893050864339,
"signal/batch_coverage_1/centered_abs_mean": 0.15663588047027588,
"signal/batch_coverage_1/group_std_mean": 0.19704360365867615,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.028397080302238465,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002239893050864339,
"signal/batch_coverage_10/centered_abs_mean": 0.18144900798797609,
"signal/batch_coverage_10/group_std_mean": 0.23120782077312468,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03289467468857765,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0025947208516299723,
"signal/batch_coverage_15/centered_abs_mean": 0.1861711025238037,
"signal/batch_coverage_15/group_std_mean": 0.23703703582286834,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03376954942941666,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0026622468139976263,
"signal/batch_coverage_20/centered_abs_mean": 0.1893145114183426,
"signal/batch_coverage_20/group_std_mean": 0.24125303626060485,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.034355881810188296,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0027071974240243436,
"signal/batch_coverage_25/centered_abs_mean": 0.1931730628013611,
"signal/batch_coverage_25/group_std_mean": 0.24652757942676545,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.035058844834566116,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0027623747009783983,
"signal/batch_coverage_5/centered_abs_mean": 0.16939508318901061,
"signal/batch_coverage_5/group_std_mean": 0.21401880979537963,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03073953352868557,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002422349713742733,
"signal/brier_reward/centered_abs_mean": 0.13135956376791,
"signal/brier_reward/group_std_mean": 0.17388156652450562,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16675288379192352,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013135956414043904,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.030612779781222343,
"signal/confidence_uniqueness_reward/group_std_mean": 0.056104399263858795,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.039064832404255866,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003061278164386749,
"signal/format_reward/centered_abs_mean": 0.01807725727558136,
"signal/format_reward/group_std_mean": 0.04161466956138611,
"signal/format_reward/group_zero_std_frac": 0.8027777910232544,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.115253297239542,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00903862863779068,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32865068316459656,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3971480667591095,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41861586570739745,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03286506943404675,
"step": 90
},
{
"calibration/aurc": 0.15553885133255266,
"calibration/batch_distribution_entropy": 0.9405537018543049,
"calibration/buffer_distribution_entropy": 0.9123583877756813,
"calibration/confidence_entropy": 0.42898815958161557,
"calibration/coverage@0%": 0.09475057832516563,
"calibration/coverage@1%": 0.09956341254976456,
"calibration/coverage@10%": 0.5655464917805556,
"calibration/coverage@15%": 0.6369851395020707,
"calibration/coverage@20%": 0.7021939915679865,
"calibration/coverage@25%": 0.7455078374465705,
"calibration/coverage@30%": 0.7778327401207218,
"calibration/coverage@5%": 0.3528750501873545,
"calibration/ece": 0.19682903494839277,
"calibration/mean_confidence": 0.5376710652957325,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017274305555555557,
"completions/max_length": 3323.4,
"completions/max_terminated_length": 3323.4,
"completions/mean_length": 806.6776245117187,
"completions/mean_terminated_length": 820.83466796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 282.4,
"epoch": 0.22799715003562457,
"grad_norm": 0.002317988546565175,
"learning_rate": 3.4036144578313257e-06,
"loss": -0.0587,
"num_tokens": 206646536.0,
"reward": 1.0056054592132568,
"reward_std": 0.15680947303771972,
"rewards/accuracy_reward": 0.6745659708976746,
"rewards/batch_coverage_0": 0.319716477394104,
"rewards/batch_coverage_1": 0.319716477394104,
"rewards/batch_coverage_10": 0.3897965490818024,
"rewards/batch_coverage_15": 0.3973939657211304,
"rewards/batch_coverage_20": 0.40343857407569883,
"rewards/batch_coverage_25": 0.4100212812423706,
"rewards/batch_coverage_5": 0.3576392471790314,
"rewards/brier_reward": 0.7860231637954712,
"rewards/confidence_uniqueness_reward": 0.929762089252472,
"rewards/format_reward": 0.9826388835906983,
"rewards/frontier_entropy_batch_reward": -0.3172291338443756,
"signal/accuracy_reward/centered_abs_mean": 0.16064995527267456,
"signal/accuracy_reward/group_std_mean": 0.21915687918663024,
"signal/accuracy_reward/group_zero_std_frac": 0.34166666865348816,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9114587306976318,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08032497763633728,
"signal/advantage_abs_mean": 0.6914363980293274,
"signal/advantage_pre_scale_abs_mean": 0.10503824055194855,
"signal/advantage_pre_scale_std": 0.18097075819969177,
"signal/advantage_std": 0.9833443641662598,
"signal/batch_coverage_0/centered_abs_mean": 0.16990404427051545,
"signal/batch_coverage_0/group_std_mean": 0.21895833909511567,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.027632010728120805,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0024296278599649666,
"signal/batch_coverage_1/centered_abs_mean": 0.16990404427051545,
"signal/batch_coverage_1/group_std_mean": 0.21895833909511567,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.027632010728120805,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0024296278599649666,
"signal/batch_coverage_10/centered_abs_mean": 0.1890587478876114,
"signal/batch_coverage_10/group_std_mean": 0.24543083608150482,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.030750004574656487,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0027035401202738286,
"signal/batch_coverage_15/centered_abs_mean": 0.19028227627277375,
"signal/batch_coverage_15/group_std_mean": 0.2467927187681198,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.030991899222135542,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002721036598086357,
"signal/batch_coverage_20/centered_abs_mean": 0.19405905902385712,
"signal/batch_coverage_20/group_std_mean": 0.25165227949619295,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.031617505475878716,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0027750445529818534,
"signal/batch_coverage_25/centered_abs_mean": 0.20035355389118195,
"signal/batch_coverage_25/group_std_mean": 0.2593338280916214,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.032659551873803136,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0028650558553636072,
"signal/batch_coverage_5/centered_abs_mean": 0.1818935126066208,
"signal/batch_coverage_5/group_std_mean": 0.23420885503292083,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.029556793347001075,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002601077314466238,
"signal/brier_reward/centered_abs_mean": 0.14850833714008332,
"signal/brier_reward/group_std_mean": 0.19897804260253907,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16746804118156433,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01485083345323801,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.042133938521146774,
"signal/confidence_uniqueness_reward/group_std_mean": 0.08468769639730453,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04754840657114982,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004213394038379193,
"signal/format_reward/centered_abs_mean": 0.03225911408662796,
"signal/format_reward/group_std_mean": 0.07445819452404975,
"signal/format_reward/group_zero_std_frac": 0.6444444537162781,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.18054555654525756,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01612955704331398,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33631141781806945,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40545159578323364,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.3834416627883911,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03363114222884178,
"step": 95
},
{
"calibration/aurc": 0.14822290368263727,
"calibration/batch_distribution_entropy": 0.9337155038720034,
"calibration/buffer_distribution_entropy": 0.9163495073770178,
"calibration/confidence_entropy": 0.4845056722594919,
"calibration/coverage@0%": 0.045044248599582566,
"calibration/coverage@1%": 0.045044248599582566,
"calibration/coverage@10%": 0.34236437978774104,
"calibration/coverage@15%": 0.6615942316378179,
"calibration/coverage@20%": 0.8370272412600379,
"calibration/coverage@25%": 0.9098417593922203,
"calibration/coverage@30%": 0.9337582321192357,
"calibration/coverage@5%": 0.08942943043126889,
"calibration/ece": 0.1282733814835266,
"calibration/mean_confidence": 0.6290357321007473,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.019791666666666673,
"completions/max_length": 3316.6,
"completions/max_terminated_length": 3316.6,
"completions/mean_length": 801.7419311523438,
"completions/mean_terminated_length": 817.86298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.6,
"epoch": 0.23999700003749952,
"grad_norm": 0.002241980517283082,
"learning_rate": 3.2530120481927713e-06,
"loss": -0.0585,
"num_tokens": 218981675.0,
"reward": 1.0059626221656799,
"reward_std": 0.15443489253520964,
"rewards/accuracy_reward": 0.675,
"rewards/batch_coverage_0": 0.33700525760650635,
"rewards/batch_coverage_1": 0.33700525760650635,
"rewards/batch_coverage_10": 0.40210442543029784,
"rewards/batch_coverage_15": 0.41211907267570497,
"rewards/batch_coverage_20": 0.42381772994995115,
"rewards/batch_coverage_25": 0.4277949512004852,
"rewards/batch_coverage_5": 0.3827322542667389,
"rewards/brier_reward": 0.8183276414871216,
"rewards/confidence_uniqueness_reward": 0.9244850397109985,
"rewards/format_reward": 0.9797743201255799,
"rewards/frontier_entropy_batch_reward": -0.34638661742210386,
"signal/accuracy_reward/centered_abs_mean": 0.14997829794883727,
"signal/accuracy_reward/group_std_mean": 0.20479413270950317,
"signal/accuracy_reward/group_zero_std_frac": 0.38333333730697633,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8926918745040894,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07498914897441863,
"signal/advantage_abs_mean": 0.69511878490448,
"signal/advantage_pre_scale_abs_mean": 0.10425726771354675,
"signal/advantage_pre_scale_std": 0.17962579131126405,
"signal/advantage_std": 0.9832849621772766,
"signal/batch_coverage_0/centered_abs_mean": 0.14159564077854156,
"signal/batch_coverage_0/group_std_mean": 0.18152335584163665,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.024330893903970717,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002024817722849548,
"signal/batch_coverage_1/centered_abs_mean": 0.14159564077854156,
"signal/batch_coverage_1/group_std_mean": 0.18152335584163665,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.024330893903970717,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002024817722849548,
"signal/batch_coverage_10/centered_abs_mean": 0.16167503893375396,
"signal/batch_coverage_10/group_std_mean": 0.2084078311920166,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.027843480557203294,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002311953017488122,
"signal/batch_coverage_15/centered_abs_mean": 0.16399608552455902,
"signal/batch_coverage_15/group_std_mean": 0.21192810237407683,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.028209054842591286,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0023451440967619417,
"signal/batch_coverage_20/centered_abs_mean": 0.1698843240737915,
"signal/batch_coverage_20/group_std_mean": 0.22043935358524322,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.029261283949017523,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0024293458089232446,
"signal/batch_coverage_25/centered_abs_mean": 0.17391247749328614,
"signal/batch_coverage_25/group_std_mean": 0.22539261877536773,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.029883557930588722,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0024869484361261128,
"signal/batch_coverage_5/centered_abs_mean": 0.15491499304771422,
"signal/batch_coverage_5/group_std_mean": 0.19899119138717652,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.02663789503276348,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0022152844350785015,
"signal/brier_reward/centered_abs_mean": 0.12495807856321335,
"signal/brier_reward/group_std_mean": 0.17128449082374572,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.14955242574214936,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01249580793082714,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.042324337363243106,
"signal/confidence_uniqueness_reward/group_std_mean": 0.0849258303642273,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.050359965115785596,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0042324337176978585,
"signal/format_reward/centered_abs_mean": 0.0322645403444767,
"signal/format_reward/group_std_mean": 0.07438525781035424,
"signal/format_reward/group_zero_std_frac": 0.6472222447395325,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.1907156676054001,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01613227017223835,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33729679584503175,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40455949306488037,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4041153252124786,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03372968137264252,
"step": 100
},
{
"epoch": 0.23999700003749952,
"eval_calibration/aurc": 0.1469159218326392,
"eval_calibration/batch_distribution_entropy": 0.8635779316733615,
"eval_calibration/buffer_distribution_entropy": 0.9188378744179434,
"eval_calibration/confidence_entropy": 0.47168668069352604,
"eval_calibration/coverage@0%": 0.28410618279569894,
"eval_calibration/coverage@1%": 0.28410618279569894,
"eval_calibration/coverage@10%": 0.46387768817204295,
"eval_calibration/coverage@15%": 0.5174731182795699,
"eval_calibration/coverage@20%": 0.7014448924731184,
"eval_calibration/coverage@25%": 0.8887768817204301,
"eval_calibration/coverage@30%": 0.957997311827957,
"eval_calibration/coverage@5%": 0.29485887096774194,
"eval_calibration/ece": 0.17558535769489247,
"eval_calibration/mean_confidence": 0.6230294628696237,
"eval_completions/clipped_ratio": 0.028472222222222215,
"eval_completions/max_length": 2266.0,
"eval_completions/max_terminated_length": 2266.0,
"eval_completions/mean_length": 773.9508870442709,
"eval_completions/mean_terminated_length": 796.6577555338541,
"eval_completions/min_length": 0.0,
"eval_completions/min_terminated_length": 319.0,
"eval_loss": 0.0,
"eval_num_tokens": 218981675.0,
"eval_reward": 0.9049823184808096,
"eval_reward_std": 0.27190424005190533,
"eval_rewards/accuracy_reward": 0.6814236144224802,
"eval_rewards/batch_coverage_0": 0.02894584136083722,
"eval_rewards/batch_coverage_1": 0.02894584136083722,
"eval_rewards/batch_coverage_10": 0.05388917370388905,
"eval_rewards/batch_coverage_15": 0.08727520704269409,
"eval_rewards/batch_coverage_20": 0.15597188969453177,
"eval_rewards/batch_coverage_25": 0.21343981474637985,
"eval_rewards/batch_coverage_5": 0.03746440044293801,
"eval_rewards/brier_reward": 0.799796978632609,
"eval_rewards/confidence_uniqueness_reward": 0.8673709034919739,
"eval_rewards/format_reward": 0.972222218910853,
"eval_rewards/frontier_entropy_batch_reward": -0.972222218910853,
"eval_runtime": 214.923,
"eval_samples_per_second": 4.653,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4163954009612401,
"eval_signal/accuracy_reward/group_std_mean": 0.46188920736312866,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.784879336754481,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.20819770048062006,
"eval_signal/advantage_abs_mean": 0.848372608423233,
"eval_signal/advantage_pre_scale_abs_mean": 0.23060731341441473,
"eval_signal/advantage_pre_scale_std": 0.27091872692108154,
"eval_signal/advantage_std": 0.9864526291688284,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.15962670495112738,
"eval_signal/batch_coverage_0/group_std_mean": 0.2362965246041616,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.008654607847953836,
"eval_signal/batch_coverage_0/weight": 0.014299999922513962,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022826618903006115,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.15962670495112738,
"eval_signal/batch_coverage_1/group_std_mean": 0.2362965246041616,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.008654607847953836,
"eval_signal/batch_coverage_1/weight": 0.014299999922513962,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.0022826618903006115,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.13342495014270148,
"eval_signal/batch_coverage_10/group_std_mean": 0.1891984591881434,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.007212459032113354,
"eval_signal/batch_coverage_10/weight": 0.014299999922513962,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.0019079767322788637,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.13718673835198084,
"eval_signal/batch_coverage_15/group_std_mean": 0.18474465360244116,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.007393745006993413,
"eval_signal/batch_coverage_15/weight": 0.014299999922513962,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.0019617703510448337,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.18008214980363846,
"eval_signal/batch_coverage_20/group_std_mean": 0.22891838351885477,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.009653995279222727,
"eval_signal/batch_coverage_20/weight": 0.014299999922513962,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025751747113342085,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.24329387893279394,
"eval_signal/batch_coverage_25/group_std_mean": 0.3025979946057002,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.013066323939710855,
"eval_signal/batch_coverage_25/weight": 0.014299999922513962,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.0034791024712224803,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.1504707137743632,
"eval_signal/batch_coverage_5/group_std_mean": 0.21995937327543894,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.008191281541561088,
"eval_signal/batch_coverage_5/weight": 0.014299999922513962,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.002151731246461471,
"eval_signal/brier_reward/centered_abs_mean": 0.19535045325756073,
"eval_signal/brier_reward/group_std_mean": 0.257330559194088,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07330747569600742,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.019535046070814133,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.07103816606104374,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.12991442531347275,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.026370037036637466,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007103816761324803,
"eval_signal/format_reward/centered_abs_mean": 0.05219184048473835,
"eval_signal/format_reward/group_std_mean": 0.119682926684618,
"eval_signal/format_reward/group_zero_std_frac": 0.4444444552063942,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.09510742562512557,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.026095920242369175,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.05219184048473835,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.119682926684618,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.4444444552063942,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.019021486553053062,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.005219184250260393,
"eval_steps_per_second": 0.028,
"step": 100
},
{
"calibration/aurc": 0.23941046945136274,
"calibration/batch_distribution_entropy": 0.9536971303588615,
"calibration/buffer_distribution_entropy": 0.9202938946779904,
"calibration/confidence_entropy": 0.4764432138439312,
"calibration/coverage@0%": 0.0743582721614013,
"calibration/coverage@1%": 0.0743582721614013,
"calibration/coverage@10%": 0.2693344966610982,
"calibration/coverage@15%": 0.37163524177588975,
"calibration/coverage@20%": 0.46363612638705776,
"calibration/coverage@25%": 0.5449943955053795,
"calibration/coverage@30%": 0.5929579093432007,
"calibration/coverage@5%": 0.19338995056617767,
"calibration/ece": 0.11256938371168097,
"calibration/mean_confidence": 0.5986923518708889,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03272569444444442,
"completions/max_length": 3670.0,
"completions/max_terminated_length": 3670.0,
"completions/mean_length": 784.5158935546875,
"completions/mean_terminated_length": 811.465380859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 258.6,
"epoch": 0.2519968500393745,
"grad_norm": 0.0023560032714158297,
"learning_rate": 3.1024096385542172e-06,
"loss": -0.1058,
"num_tokens": 231096162.0,
"reward": 0.9873493909835815,
"reward_std": 0.19203925132751465,
"rewards/accuracy_reward": 0.6542534589767456,
"rewards/batch_coverage_0": 0.3244723677635193,
"rewards/batch_coverage_1": 0.3244723677635193,
"rewards/batch_coverage_10": 0.3919206619262695,
"rewards/batch_coverage_15": 0.39909898638725283,
"rewards/batch_coverage_20": 0.4087284207344055,
"rewards/batch_coverage_25": 0.41347445249557496,
"rewards/batch_coverage_5": 0.369165313243866,
"rewards/brier_reward": 0.7991156458854676,
"rewards/confidence_uniqueness_reward": 0.9148384690284729,
"rewards/format_reward": 0.9669270873069763,
"rewards/frontier_entropy_batch_reward": -0.32264376878738404,
"signal/accuracy_reward/centered_abs_mean": 0.162060546875,
"signal/accuracy_reward/group_std_mean": 0.22576369643211364,
"signal/accuracy_reward/group_zero_std_frac": 0.3111111134290695,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.7692596673965454,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0810302734375,
"signal/advantage_abs_mean": 0.6655745029449462,
"signal/advantage_pre_scale_abs_mean": 0.12335889935493469,
"signal/advantage_pre_scale_std": 0.21603835821151735,
"signal/advantage_std": 0.9835192799568176,
"signal/batch_coverage_0/centered_abs_mean": 0.15007852017879486,
"signal/batch_coverage_0/group_std_mean": 0.19404472410678864,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.020451470464468,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0021461226744577287,
"signal/batch_coverage_1/centered_abs_mean": 0.15007852017879486,
"signal/batch_coverage_1/group_std_mean": 0.19404472410678864,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.020451470464468,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0021461226744577287,
"signal/batch_coverage_10/centered_abs_mean": 0.16576945781707764,
"signal/batch_coverage_10/group_std_mean": 0.21533091962337494,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.022668870538473128,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0023705032654106617,
"signal/batch_coverage_15/centered_abs_mean": 0.1676239103078842,
"signal/batch_coverage_15/group_std_mean": 0.2177862823009491,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.022917681932449342,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002397021884098649,
"signal/batch_coverage_20/centered_abs_mean": 0.1729596495628357,
"signal/batch_coverage_20/group_std_mean": 0.2248106360435486,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.023749194294214248,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0024733229540288447,
"signal/batch_coverage_25/centered_abs_mean": 0.1749100923538208,
"signal/batch_coverage_25/group_std_mean": 0.22794493436813354,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.024009492993354798,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0025012143421918155,
"signal/batch_coverage_5/centered_abs_mean": 0.16244375109672546,
"signal/batch_coverage_5/group_std_mean": 0.2103522926568985,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.02219676934182644,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002322945697233081,
"signal/brier_reward/centered_abs_mean": 0.13956743776798247,
"signal/brier_reward/group_std_mean": 0.19816478788852693,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.13259952664375305,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013956744223833084,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06475906372070313,
"signal/confidence_uniqueness_reward/group_std_mean": 0.1287354990839958,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.061011632531881334,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006475906539708376,
"signal/format_reward/centered_abs_mean": 0.06034613698720932,
"signal/format_reward/group_std_mean": 0.1259764924645424,
"signal/format_reward/group_zero_std_frac": 0.45277778506278993,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.28261774182319643,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.03017306849360466,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34150766730308535,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40601075887680055,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.32770722508430483,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03415076658129692,
"step": 105
},
{
"calibration/aurc": 0.13983259526717134,
"calibration/batch_distribution_entropy": 0.9209507980047758,
"calibration/buffer_distribution_entropy": 0.9227978962787329,
"calibration/confidence_entropy": 0.4329057677807843,
"calibration/coverage@0%": 0.09626697385971085,
"calibration/coverage@1%": 0.09836121469740719,
"calibration/coverage@10%": 0.4027895951617685,
"calibration/coverage@15%": 0.6134134086345293,
"calibration/coverage@20%": 0.6779665906935597,
"calibration/coverage@25%": 0.8846078868218783,
"calibration/coverage@30%": 0.9452127659574469,
"calibration/coverage@5%": 0.2805005944055738,
"calibration/ece": 0.12565381656881222,
"calibration/mean_confidence": 0.6210956583945125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02430555555555556,
"completions/max_length": 3419.6,
"completions/max_terminated_length": 3419.6,
"completions/mean_length": 795.7691772460937,
"completions/mean_terminated_length": 815.7261596679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 277.4,
"epoch": 0.2639967000412495,
"grad_norm": 0.002739675110206008,
"learning_rate": 2.9518072289156627e-06,
"loss": -0.077,
"num_tokens": 243371871.0,
"reward": 1.0169583320617677,
"reward_std": 0.16964165568351747,
"rewards/accuracy_reward": 0.7042534828186036,
"rewards/batch_coverage_0": 0.3346091389656067,
"rewards/batch_coverage_1": 0.3346091389656067,
"rewards/batch_coverage_10": 0.40635396242141725,
"rewards/batch_coverage_15": 0.4123306512832642,
"rewards/batch_coverage_20": 0.41916595697402953,
"rewards/batch_coverage_25": 0.42104737758636473,
"rewards/batch_coverage_5": 0.37122017741203306,
"rewards/brier_reward": 0.8008349299430847,
"rewards/confidence_uniqueness_reward": 0.9194580793380738,
"rewards/format_reward": 0.9751736164093018,
"rewards/frontier_entropy_batch_reward": -0.3338502585887909,
"signal/accuracy_reward/centered_abs_mean": 0.1485080301761627,
"signal/accuracy_reward/group_std_mean": 0.20822436213493348,
"signal/accuracy_reward/group_zero_std_frac": 0.3527777910232544,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8229191660881042,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07425401508808135,
"signal/advantage_abs_mean": 0.6725971579551697,
"signal/advantage_pre_scale_abs_mean": 0.11048495620489121,
"signal/advantage_pre_scale_std": 0.19505227506160736,
"signal/advantage_std": 0.983374273777008,
"signal/batch_coverage_0/centered_abs_mean": 0.14558544754981995,
"signal/batch_coverage_0/group_std_mean": 0.18584783673286437,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.02308763712644577,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002081871870905161,
"signal/batch_coverage_1/centered_abs_mean": 0.14558544754981995,
"signal/batch_coverage_1/group_std_mean": 0.18584783673286437,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.02308763712644577,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002081871870905161,
"signal/batch_coverage_10/centered_abs_mean": 0.1662052035331726,
"signal/batch_coverage_10/group_std_mean": 0.2154100239276886,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.02644023597240448,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0023767344187945127,
"signal/batch_coverage_15/centered_abs_mean": 0.16461114585399628,
"signal/batch_coverage_15/group_std_mean": 0.21437348127365113,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.026267021521925928,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002353939367458224,
"signal/batch_coverage_20/centered_abs_mean": 0.16556849777698518,
"signal/batch_coverage_20/group_std_mean": 0.21627365350723265,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.026395087689161302,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002367629436776042,
"signal/batch_coverage_25/centered_abs_mean": 0.16443374752998352,
"signal/batch_coverage_25/group_std_mean": 0.21521584689617157,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.026125259697437286,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0023514027241617443,
"signal/batch_coverage_5/centered_abs_mean": 0.15621961951255797,
"signal/batch_coverage_5/group_std_mean": 0.2001771241426468,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.024791359528899194,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0022339405957609417,
"signal/brier_reward/centered_abs_mean": 0.13265540301799775,
"signal/brier_reward/group_std_mean": 0.18497947752475738,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.14658557772636413,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013265540637075902,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.05351239144802093,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10313453078269959,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05855522751808166,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005351239163428545,
"signal/format_reward/centered_abs_mean": 0.04409722238779068,
"signal/format_reward/group_std_mean": 0.09439656734466553,
"signal/format_reward/group_zero_std_frac": 0.5666666805744172,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.2371739089488983,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.02204861119389534,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33227952718734743,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4028955101966858,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.3693257212638855,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03322795443236828,
"step": 110
},
{
"calibration/aurc": 0.22413314193294318,
"calibration/batch_distribution_entropy": 0.9561271819585884,
"calibration/buffer_distribution_entropy": 0.9254413068575653,
"calibration/confidence_entropy": 0.48204974103909837,
"calibration/coverage@0%": 0.02640799142926977,
"calibration/coverage@1%": 0.02640799142926977,
"calibration/coverage@10%": 0.25870477742214976,
"calibration/coverage@15%": 0.4443397401757377,
"calibration/coverage@20%": 0.61614655555679,
"calibration/coverage@25%": 0.6815468359673653,
"calibration/coverage@30%": 0.7126977128538204,
"calibration/coverage@5%": 0.10948419227001834,
"calibration/ece": 0.16517593758056204,
"calibration/mean_confidence": 0.5588173910737515,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01909722222222223,
"completions/max_length": 3680.8,
"completions/max_terminated_length": 3680.8,
"completions/mean_length": 804.56328125,
"completions/mean_terminated_length": 820.0994506835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 290.6,
"epoch": 0.27599655004312446,
"grad_norm": 0.0027605455834418535,
"learning_rate": 2.8012048192771087e-06,
"loss": -0.059,
"num_tokens": 255719640.0,
"reward": 1.007480835914612,
"reward_std": 0.1431771606206894,
"rewards/accuracy_reward": 0.6782118082046509,
"rewards/batch_coverage_0": 0.3108770728111267,
"rewards/batch_coverage_1": 0.3108770728111267,
"rewards/batch_coverage_10": 0.3816157579421997,
"rewards/batch_coverage_15": 0.39420172572135925,
"rewards/batch_coverage_20": 0.4015673458576202,
"rewards/batch_coverage_25": 0.40532844662666323,
"rewards/batch_coverage_5": 0.35119245052337644,
"rewards/brier_reward": 0.7949256420135498,
"rewards/confidence_uniqueness_reward": 0.928059709072113,
"rewards/format_reward": 0.9809027791023255,
"rewards/frontier_entropy_batch_reward": -0.30920955538749695,
"signal/accuracy_reward/centered_abs_mean": 0.14369032084941863,
"signal/accuracy_reward/group_std_mean": 0.19022368788719177,
"signal/accuracy_reward/group_zero_std_frac": 0.45277778506278993,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9364258766174316,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07184516042470931,
"signal/advantage_abs_mean": 0.7183647990226746,
"signal/advantage_pre_scale_abs_mean": 0.10155642777681351,
"signal/advantage_pre_scale_std": 0.17249636948108674,
"signal/advantage_std": 0.9831730961799622,
"signal/batch_coverage_0/centered_abs_mean": 0.1581096827983856,
"signal/batch_coverage_0/group_std_mean": 0.20030168890953065,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0294973898679018,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022609684616327285,
"signal/batch_coverage_1/centered_abs_mean": 0.1581096827983856,
"signal/batch_coverage_1/group_std_mean": 0.20030168890953065,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0294973898679018,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0022609684616327285,
"signal/batch_coverage_10/centered_abs_mean": 0.179025998711586,
"signal/batch_coverage_10/group_std_mean": 0.22901394069194794,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.033418096229434015,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0025600716937333344,
"signal/batch_coverage_15/centered_abs_mean": 0.18170326352119445,
"signal/batch_coverage_15/group_std_mean": 0.2334654837846756,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03398225046694279,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002598356641829014,
"signal/batch_coverage_20/centered_abs_mean": 0.18451933860778807,
"signal/batch_coverage_20/group_std_mean": 0.23735863268375396,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03448176868259907,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002638626517727971,
"signal/batch_coverage_25/centered_abs_mean": 0.1879439651966095,
"signal/batch_coverage_25/group_std_mean": 0.24171611070632934,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03511525765061378,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0026875986717641354,
"signal/batch_coverage_5/centered_abs_mean": 0.16925646960735322,
"signal/batch_coverage_5/group_std_mean": 0.21534042656421662,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03158362843096256,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0024203675333410502,
"signal/brier_reward/centered_abs_mean": 0.13752256333827972,
"signal/brier_reward/group_std_mean": 0.18067309260368347,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1799521267414093,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013752257265150548,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.04287937767803669,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07394094616174698,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0566208966076374,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004287937888875604,
"signal/format_reward/centered_abs_mean": 0.03215060792863369,
"signal/format_reward/group_std_mean": 0.06199740841984749,
"signal/format_reward/group_zero_std_frac": 0.7361111044883728,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.21110976338386536,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.016075303964316844,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3243951678276062,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3954238653182983,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4267133414745331,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.032439516857266426,
"step": 115
},
{
"calibration/aurc": 0.21397657408258555,
"calibration/batch_distribution_entropy": 0.9258972265984567,
"calibration/buffer_distribution_entropy": 0.9280627554882404,
"calibration/confidence_entropy": 0.44351669627470247,
"calibration/coverage@0%": 0.04573898394481331,
"calibration/coverage@1%": 0.04573898394481331,
"calibration/coverage@10%": 0.4020745849981918,
"calibration/coverage@15%": 0.49198784044610433,
"calibration/coverage@20%": 0.5461041566090393,
"calibration/coverage@25%": 0.5702239862091493,
"calibration/coverage@30%": 0.6633931622763249,
"calibration/coverage@5%": 0.13628652176502468,
"calibration/ece": 0.134714352205786,
"calibration/mean_confidence": 0.6368109718865416,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006423611111111139,
"completions/max_length": 3506.6,
"completions/max_terminated_length": 3506.6,
"completions/mean_length": 806.577001953125,
"completions/mean_terminated_length": 811.7610961914063,
"completions/min_length": 0.0,
"completions/min_terminated_length": 297.2,
"epoch": 0.28799640004499943,
"grad_norm": 0.002654542215168476,
"learning_rate": 2.6506024096385547e-06,
"loss": -0.0263,
"num_tokens": 268093263.0,
"reward": 1.0258622884750366,
"reward_std": 0.1160861998796463,
"rewards/accuracy_reward": 0.6960069537162781,
"rewards/batch_coverage_0": 0.3644671857357025,
"rewards/batch_coverage_1": 0.3644671857357025,
"rewards/batch_coverage_10": 0.44366928935050964,
"rewards/batch_coverage_15": 0.45305138230323794,
"rewards/batch_coverage_20": 0.46463318467140197,
"rewards/batch_coverage_25": 0.4713035702705383,
"rewards/batch_coverage_5": 0.41088991761207583,
"rewards/brier_reward": 0.8357924342155456,
"rewards/confidence_uniqueness_reward": 0.9336610436439514,
"rewards/format_reward": 0.9934895873069763,
"rewards/frontier_entropy_batch_reward": -0.38337743282318115,
"signal/accuracy_reward/centered_abs_mean": 0.12651909589767457,
"signal/accuracy_reward/group_std_mean": 0.17061871588230132,
"signal/accuracy_reward/group_zero_std_frac": 0.49722222685813905,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0228377342224122,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06325954794883729,
"signal/advantage_abs_mean": 0.7268272519111634,
"signal/advantage_pre_scale_abs_mean": 0.08354652673006058,
"signal/advantage_pre_scale_std": 0.1430157944560051,
"signal/advantage_std": 0.9828723788261413,
"signal/batch_coverage_0/centered_abs_mean": 0.15081151723861694,
"signal/batch_coverage_0/group_std_mean": 0.1929273337125778,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03486100696027279,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002156604756601155,
"signal/batch_coverage_1/centered_abs_mean": 0.15081151723861694,
"signal/batch_coverage_1/group_std_mean": 0.1929273337125778,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03486100696027279,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002156604756601155,
"signal/batch_coverage_10/centered_abs_mean": 0.17233260571956635,
"signal/batch_coverage_10/group_std_mean": 0.22250266671180724,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03987372517585754,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0024643562734127044,
"signal/batch_coverage_15/centered_abs_mean": 0.17618072628974915,
"signal/batch_coverage_15/group_std_mean": 0.22784018516540527,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.040774937719106674,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0025193843990564345,
"signal/batch_coverage_20/centered_abs_mean": 0.1772436648607254,
"signal/batch_coverage_20/group_std_mean": 0.23021324276924132,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04101964309811592,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025345843750983477,
"signal/batch_coverage_25/centered_abs_mean": 0.18081068694591523,
"signal/batch_coverage_25/group_std_mean": 0.2347923129796982,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04183421730995178,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0025855927728116513,
"signal/batch_coverage_5/centered_abs_mean": 0.16231553256511688,
"signal/batch_coverage_5/group_std_mean": 0.2089778631925583,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.037541044503450395,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023211120162159203,
"signal/brier_reward/centered_abs_mean": 0.11250930428504943,
"signal/brier_reward/group_std_mean": 0.1498480796813965,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18200299441814421,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011250930652022362,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.029111380875110626,
"signal/confidence_uniqueness_reward/group_std_mean": 0.04803909808397293,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.047121844440698626,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0029111379291862248,
"signal/format_reward/centered_abs_mean": 0.012060546688735485,
"signal/format_reward/group_std_mean": 0.027607891708612442,
"signal/format_reward/group_zero_std_frac": 0.8694444417953491,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.09729969501495361,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006030273344367743,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3399507701396942,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40973573327064516,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5499330401420593,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0339950755238533,
"step": 120
},
{
"calibration/aurc": 0.12505652489525854,
"calibration/batch_distribution_entropy": 0.9053102395267109,
"calibration/buffer_distribution_entropy": 0.9297300307812222,
"calibration/confidence_entropy": 0.4474353663526843,
"calibration/coverage@0%": 0.09448594417348308,
"calibration/coverage@1%": 0.2533700281595579,
"calibration/coverage@10%": 0.537857101230514,
"calibration/coverage@15%": 0.6128202678308693,
"calibration/coverage@20%": 0.6741103541459968,
"calibration/coverage@25%": 0.8841481912099155,
"calibration/coverage@30%": 0.9522485533204741,
"calibration/coverage@5%": 0.33825546076703933,
"calibration/ece": 0.1661430153743542,
"calibration/mean_confidence": 0.6058376390647029,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003645833333333304,
"completions/max_length": 2546.0,
"completions/max_terminated_length": 2546.0,
"completions/mean_length": 796.4169311523438,
"completions/mean_terminated_length": 799.3395141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.6,
"epoch": 0.2999962500468744,
"grad_norm": 0.002558889100328088,
"learning_rate": 2.5e-06,
"loss": -0.0092,
"num_tokens": 280385650.0,
"reward": 1.0349355697631837,
"reward_std": 0.11634077429771424,
"rewards/accuracy_reward": 0.7077256917953492,
"rewards/batch_coverage_0": 0.33946998715400695,
"rewards/batch_coverage_1": 0.33946998715400695,
"rewards/batch_coverage_10": 0.4023483574390411,
"rewards/batch_coverage_15": 0.4080226182937622,
"rewards/batch_coverage_20": 0.42021956443786623,
"rewards/batch_coverage_25": 0.42603095173835753,
"rewards/batch_coverage_5": 0.37814598679542544,
"rewards/brier_reward": 0.8199628710746765,
"rewards/confidence_uniqueness_reward": 0.9409253120422363,
"rewards/format_reward": 0.9963541626930237,
"rewards/frontier_entropy_batch_reward": -0.31999213695526124,
"signal/accuracy_reward/centered_abs_mean": 0.14109700471162795,
"signal/accuracy_reward/group_std_mean": 0.18465364575386048,
"signal/accuracy_reward/group_zero_std_frac": 0.4777777850627899,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0281545758247375,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07054850235581397,
"signal/advantage_abs_mean": 0.7421541452407837,
"signal/advantage_pre_scale_abs_mean": 0.08656396716833115,
"signal/advantage_pre_scale_std": 0.13928102254867553,
"signal/advantage_std": 0.9830185294151306,
"signal/batch_coverage_0/centered_abs_mean": 0.15885628163814544,
"signal/batch_coverage_0/group_std_mean": 0.200607630610466,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.033229194954037665,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022716447710990906,
"signal/batch_coverage_1/centered_abs_mean": 0.15885628163814544,
"signal/batch_coverage_1/group_std_mean": 0.200607630610466,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.033229194954037665,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0022716447710990906,
"signal/batch_coverage_10/centered_abs_mean": 0.17724139988422394,
"signal/batch_coverage_10/group_std_mean": 0.22531118392944335,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.037116704136133195,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0025345519650727512,
"signal/batch_coverage_15/centered_abs_mean": 0.17550498247146606,
"signal/batch_coverage_15/group_std_mean": 0.22393256425857544,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.036869344860315324,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002509721275418997,
"signal/batch_coverage_20/centered_abs_mean": 0.1801792323589325,
"signal/batch_coverage_20/group_std_mean": 0.23051097393035888,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.037887966632843016,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025765630416572094,
"signal/batch_coverage_25/centered_abs_mean": 0.1810545653104782,
"signal/batch_coverage_25/group_std_mean": 0.2324410378932953,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03815377466380596,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002589080249890685,
"signal/batch_coverage_5/centered_abs_mean": 0.16935512125492097,
"signal/batch_coverage_5/group_std_mean": 0.21444514393806458,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0354438565671444,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.00242177820764482,
"signal/brier_reward/centered_abs_mean": 0.11938721984624863,
"signal/brier_reward/group_std_mean": 0.15605800747871398,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17510702908039094,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011938722059130668,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.022502795234322547,
"signal/confidence_uniqueness_reward/group_std_mean": 0.035726646333932875,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03346846289932728,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0022502796724438666,
"signal/format_reward/centered_abs_mean": 0.006901041604578495,
"signal/format_reward/group_std_mean": 0.016878756508231164,
"signal/format_reward/group_zero_std_frac": 0.9166666865348816,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.05064408630132675,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0034505208022892475,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3258630931377411,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39527029991149903,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4775148153305054,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0325863104313612,
"step": 125
},
{
"calibration/aurc": 0.18964223588341061,
"calibration/batch_distribution_entropy": 0.8968313820197127,
"calibration/buffer_distribution_entropy": 0.9302606312327433,
"calibration/confidence_entropy": 0.4109658348973223,
"calibration/coverage@0%": 0.029853725752874115,
"calibration/coverage@1%": 0.06745163698002817,
"calibration/coverage@10%": 0.3061498211511034,
"calibration/coverage@15%": 0.4607180967743406,
"calibration/coverage@20%": 0.6071381242054323,
"calibration/coverage@25%": 0.6690803330006971,
"calibration/coverage@30%": 0.792739877608118,
"calibration/coverage@5%": 0.12567499897475154,
"calibration/ece": 0.12968557612437498,
"calibration/mean_confidence": 0.6015370253885455,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007638888888888906,
"completions/max_length": 2966.8,
"completions/max_terminated_length": 2966.8,
"completions/mean_length": 811.5697998046875,
"completions/mean_terminated_length": 817.8347290039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 249.2,
"epoch": 0.3119961000487494,
"grad_norm": 0.002718381118029356,
"learning_rate": 2.349397590361446e-06,
"loss": -0.0172,
"num_tokens": 292859734.0,
"reward": 1.009836530685425,
"reward_std": 0.12487356960773469,
"rewards/accuracy_reward": 0.6718750119209289,
"rewards/batch_coverage_0": 0.34405243396759033,
"rewards/batch_coverage_1": 0.34405243396759033,
"rewards/batch_coverage_10": 0.4103072345256805,
"rewards/batch_coverage_15": 0.4188034474849701,
"rewards/batch_coverage_20": 0.43137113451957704,
"rewards/batch_coverage_25": 0.4350013732910156,
"rewards/batch_coverage_5": 0.38362832069396974,
"rewards/brier_reward": 0.810641348361969,
"rewards/confidence_uniqueness_reward": 0.932453966140747,
"rewards/format_reward": 0.9923611044883728,
"rewards/frontier_entropy_batch_reward": -0.3616227746009827,
"signal/accuracy_reward/centered_abs_mean": 0.15205078423023224,
"signal/accuracy_reward/group_std_mean": 0.19650846123695373,
"signal/accuracy_reward/group_zero_std_frac": 0.45833333730697634,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.1515358924865722,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07602539211511612,
"signal/advantage_abs_mean": 0.7485694050788879,
"signal/advantage_pre_scale_abs_mean": 0.09428424239158631,
"signal/advantage_pre_scale_std": 0.15209928154945374,
"signal/advantage_std": 0.9829629063606262,
"signal/batch_coverage_0/centered_abs_mean": 0.15308986306190492,
"signal/batch_coverage_0/group_std_mean": 0.19674286246299744,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.033842490240931514,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002189185074530542,
"signal/batch_coverage_1/centered_abs_mean": 0.15308986306190492,
"signal/batch_coverage_1/group_std_mean": 0.19674286246299744,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.033842490240931514,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002189185074530542,
"signal/batch_coverage_10/centered_abs_mean": 0.1709138721227646,
"signal/batch_coverage_10/group_std_mean": 0.22127122581005096,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.037541619315743444,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0024440682493150232,
"signal/batch_coverage_15/centered_abs_mean": 0.1729219973087311,
"signal/batch_coverage_15/group_std_mean": 0.22372837960720063,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03800523579120636,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0024727844633162023,
"signal/batch_coverage_20/centered_abs_mean": 0.17704226374626159,
"signal/batch_coverage_20/group_std_mean": 0.22976912558078766,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03873509056866169,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025317042600363493,
"signal/batch_coverage_25/centered_abs_mean": 0.17734608054161072,
"signal/batch_coverage_25/group_std_mean": 0.23120309710502623,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03868867196142674,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0025360488798469305,
"signal/batch_coverage_5/centered_abs_mean": 0.1632935345172882,
"signal/batch_coverage_5/group_std_mean": 0.2102894514799118,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03598980866372585,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023350975010544063,
"signal/brier_reward/centered_abs_mean": 0.1272154778242111,
"signal/brier_reward/group_std_mean": 0.16492744982242585,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19245673716068268,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012721548043191432,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.031321781873703006,
"signal/confidence_uniqueness_reward/group_std_mean": 0.04696677774190903,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04759872481226921,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.00313217812217772,
"signal/format_reward/centered_abs_mean": 0.012825520895421504,
"signal/format_reward/group_std_mean": 0.024604595825076105,
"signal/format_reward/group_zero_std_frac": 0.8944444537162781,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0938265562057495,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006412760447710752,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.339335960149765,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4103351831436157,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5164508819580078,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033933597058057784,
"step": 130
},
{
"calibration/aurc": 0.1719525584030117,
"calibration/batch_distribution_entropy": 0.9388767749321543,
"calibration/buffer_distribution_entropy": 0.9316017917252024,
"calibration/confidence_entropy": 0.4678890728504398,
"calibration/coverage@0%": 0.08349651871192341,
"calibration/coverage@1%": 0.13143766318537858,
"calibration/coverage@10%": 0.4041857049608355,
"calibration/coverage@15%": 0.4981723237597911,
"calibration/coverage@20%": 0.5843342036553525,
"calibration/coverage@25%": 0.6877284595300261,
"calibration/coverage@30%": 0.814621409921671,
"calibration/coverage@5%": 0.2992629460400348,
"calibration/ece": 0.13737020768853353,
"calibration/mean_confidence": 0.6075435421809182,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004861111111111094,
"completions/max_length": 3164.2,
"completions/max_terminated_length": 3164.2,
"completions/mean_length": 797.4372436523438,
"completions/mean_terminated_length": 801.3845947265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 259.6,
"epoch": 0.32399595005062437,
"grad_norm": 0.002610380295664072,
"learning_rate": 2.1987951807228917e-06,
"loss": -0.0119,
"num_tokens": 305139235.0,
"reward": 1.0320541143417359,
"reward_std": 0.11483577936887741,
"rewards/accuracy_reward": 0.6981770753860473,
"rewards/batch_coverage_0": 0.33804223537445066,
"rewards/batch_coverage_1": 0.33804223537445066,
"rewards/batch_coverage_10": 0.42376949787139895,
"rewards/batch_coverage_15": 0.4312718451023102,
"rewards/batch_coverage_20": 0.43675391674041747,
"rewards/batch_coverage_25": 0.4407239556312561,
"rewards/batch_coverage_5": 0.38694225549697875,
"rewards/brier_reward": 0.8242491722106934,
"rewards/confidence_uniqueness_reward": 0.9419329762458801,
"rewards/format_reward": 0.9949652791023255,
"rewards/frontier_entropy_batch_reward": -0.3111158788204193,
"signal/accuracy_reward/centered_abs_mean": 0.1289333775639534,
"signal/accuracy_reward/group_std_mean": 0.1770886391401291,
"signal/accuracy_reward/group_zero_std_frac": 0.4666666746139526,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9275030612945556,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0644666887819767,
"signal/advantage_abs_mean": 0.73212890625,
"signal/advantage_pre_scale_abs_mean": 0.08376801013946533,
"signal/advantage_pre_scale_std": 0.1376526966691017,
"signal/advantage_std": 0.9830352306365967,
"signal/batch_coverage_0/centered_abs_mean": 0.15243115425109863,
"signal/batch_coverage_0/group_std_mean": 0.19285766780376434,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.031641974300146106,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0021797654684633017,
"signal/batch_coverage_1/centered_abs_mean": 0.15243115425109863,
"signal/batch_coverage_1/group_std_mean": 0.19285766780376434,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.031641974300146106,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0021797654684633017,
"signal/batch_coverage_10/centered_abs_mean": 0.1790243446826935,
"signal/batch_coverage_10/group_std_mean": 0.2288077563047409,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.037215839698910715,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002560048084706068,
"signal/batch_coverage_15/centered_abs_mean": 0.176902437210083,
"signal/batch_coverage_15/group_std_mean": 0.2270435243844986,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03675914704799652,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002529704850167036,
"signal/batch_coverage_20/centered_abs_mean": 0.17587584555149077,
"signal/batch_coverage_20/group_std_mean": 0.2266252100467682,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.036522331088781355,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025150245986878873,
"signal/batch_coverage_25/centered_abs_mean": 0.1777946799993515,
"signal/batch_coverage_25/group_std_mean": 0.2295131653547287,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03697492778301239,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002542464016005397,
"signal/batch_coverage_5/centered_abs_mean": 0.16585546731948853,
"signal/batch_coverage_5/group_std_mean": 0.2102999210357666,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03446479067206383,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023717330768704414,
"signal/brier_reward/centered_abs_mean": 0.11271385103464127,
"signal/brier_reward/group_std_mean": 0.148339182138443,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1631714165210724,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011271385289728642,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.023278644308447838,
"signal/confidence_uniqueness_reward/group_std_mean": 0.035085957124829294,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03395293578505516,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0023278645239770412,
"signal/format_reward/centered_abs_mean": 0.008951822761446238,
"signal/format_reward/group_std_mean": 0.0176251407712698,
"signal/format_reward/group_zero_std_frac": 0.9250000238418579,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.06485291570425034,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.004475911380723119,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32853866219520567,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39873294830322265,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47811604142189024,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03285386860370636,
"step": 135
},
{
"calibration/aurc": 0.14304316351618623,
"calibration/batch_distribution_entropy": 0.9370140718828288,
"calibration/buffer_distribution_entropy": 0.9367364808679118,
"calibration/confidence_entropy": 0.4456631514533238,
"calibration/coverage@0%": 0.058948490813648294,
"calibration/coverage@1%": 0.058948490813648294,
"calibration/coverage@10%": 0.466063812335958,
"calibration/coverage@15%": 0.5719939304461943,
"calibration/coverage@20%": 0.7536909448818897,
"calibration/coverage@25%": 0.8405101706036746,
"calibration/coverage@30%": 0.9127050524934383,
"calibration/coverage@5%": 0.26188074146981627,
"calibration/ece": 0.11045847536540351,
"calibration/mean_confidence": 0.622812944901985,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004513888888888884,
"completions/max_length": 3191.6,
"completions/max_terminated_length": 3191.6,
"completions/mean_length": 807.037158203125,
"completions/mean_terminated_length": 810.6978637695313,
"completions/min_length": 0.0,
"completions/min_terminated_length": 296.2,
"epoch": 0.33599580005249935,
"grad_norm": 0.0026262206956744194,
"learning_rate": 2.0481927710843377e-06,
"loss": -0.0153,
"num_tokens": 317540527.0,
"reward": 1.024456834793091,
"reward_std": 0.11049772948026657,
"rewards/accuracy_reward": 0.6847222208976745,
"rewards/batch_coverage_0": 0.35727536082267763,
"rewards/batch_coverage_1": 0.35727536082267763,
"rewards/batch_coverage_10": 0.43343634605407716,
"rewards/batch_coverage_15": 0.44259007573127745,
"rewards/batch_coverage_20": 0.44810510873794557,
"rewards/batch_coverage_25": 0.4504728734493256,
"rewards/batch_coverage_5": 0.39787633419036866,
"rewards/brier_reward": 0.8221214413642883,
"rewards/confidence_uniqueness_reward": 0.9401463627815246,
"rewards/format_reward": 0.9949652910232544,
"rewards/frontier_entropy_batch_reward": -0.3289829194545746,
"signal/accuracy_reward/centered_abs_mean": 0.1142578125,
"signal/accuracy_reward/group_std_mean": 0.1605698734521866,
"signal/accuracy_reward/group_zero_std_frac": 0.5055555522441864,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8772634863853455,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.05712890625,
"signal/advantage_abs_mean": 0.7328930735588074,
"signal/advantage_pre_scale_abs_mean": 0.07996267676353455,
"signal/advantage_pre_scale_std": 0.13530257642269133,
"signal/advantage_std": 0.9829383611679077,
"signal/batch_coverage_0/centered_abs_mean": 0.15471593141555787,
"signal/batch_coverage_0/group_std_mean": 0.19745731651782988,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0342778779566288,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002212437754496932,
"signal/batch_coverage_1/centered_abs_mean": 0.15471593141555787,
"signal/batch_coverage_1/group_std_mean": 0.19745731651782988,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0342778779566288,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002212437754496932,
"signal/batch_coverage_10/centered_abs_mean": 0.17826486825942994,
"signal/batch_coverage_10/group_std_mean": 0.22812734246253968,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03941522017121315,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002549187559634447,
"signal/batch_coverage_15/centered_abs_mean": 0.18213868141174316,
"signal/batch_coverage_15/group_std_mean": 0.23364066183567048,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.04027082771062851,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0026045831851661204,
"signal/batch_coverage_20/centered_abs_mean": 0.18287501633167266,
"signal/batch_coverage_20/group_std_mean": 0.2352455586194992,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04047075062990189,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002615112857893109,
"signal/batch_coverage_25/centered_abs_mean": 0.18394773900508882,
"signal/batch_coverage_25/group_std_mean": 0.23635469377040863,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04076088219881058,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0026304526720196007,
"signal/batch_coverage_5/centered_abs_mean": 0.1656036376953125,
"signal/batch_coverage_5/group_std_mean": 0.2107256144285202,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.036629929393529895,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002368132071569562,
"signal/brier_reward/centered_abs_mean": 0.1181584894657135,
"signal/brier_reward/group_std_mean": 0.15519708395004272,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18297027945518493,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01181584969162941,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.024365550279617308,
"signal/confidence_uniqueness_reward/group_std_mean": 0.0377134170383215,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.038112782314419745,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.00243655510712415,
"signal/format_reward/centered_abs_mean": 0.008908420242369175,
"signal/format_reward/group_std_mean": 0.018860687501728535,
"signal/format_reward/group_zero_std_frac": 0.9138889074325561,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0694626085460186,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.004454210121184588,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33632318377494813,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40446406602859497,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5234416484832763,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03363232016563415,
"step": 140
},
{
"calibration/aurc": 0.14992865764971097,
"calibration/batch_distribution_entropy": 0.9724585718516892,
"calibration/buffer_distribution_entropy": 0.9480821107150399,
"calibration/confidence_entropy": 0.4557665676761431,
"calibration/coverage@0%": 0.11152017370272649,
"calibration/coverage@1%": 0.13131184036939314,
"calibration/coverage@10%": 0.4511516051011434,
"calibration/coverage@15%": 0.5615957563764291,
"calibration/coverage@20%": 0.6935809696569921,
"calibration/coverage@25%": 0.8204677880386984,
"calibration/coverage@30%": 0.8737851802990326,
"calibration/coverage@5%": 0.2761035070360598,
"calibration/ece": 0.13730210610572074,
"calibration/mean_confidence": 0.5631208144994904,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003645833333333348,
"completions/max_length": 3001.8,
"completions/max_terminated_length": 3001.8,
"completions/mean_length": 838.5323120117188,
"completions/mean_terminated_length": 841.6531982421875,
"completions/min_length": 60.6,
"completions/min_terminated_length": 305.6,
"epoch": 0.34799565005437433,
"grad_norm": 0.002406596438959241,
"learning_rate": 1.8975903614457832e-06,
"loss": -0.0085,
"num_tokens": 330265027.0,
"reward": 1.0442128896713256,
"reward_std": 0.10550991892814636,
"rewards/accuracy_reward": 0.7213541746139527,
"rewards/batch_coverage_0": 0.3693415939807892,
"rewards/batch_coverage_1": 0.3693415939807892,
"rewards/batch_coverage_10": 0.4359207212924957,
"rewards/batch_coverage_15": 0.44635804295539855,
"rewards/batch_coverage_20": 0.45086843371391294,
"rewards/batch_coverage_25": 0.4531724214553833,
"rewards/batch_coverage_5": 0.41727402806282043,
"rewards/brier_reward": 0.8248473882675171,
"rewards/confidence_uniqueness_reward": 0.9416616916656494,
"rewards/format_reward": 0.9961805701255798,
"rewards/frontier_entropy_batch_reward": -0.3327996253967285,
"signal/accuracy_reward/centered_abs_mean": 0.12060546725988389,
"signal/accuracy_reward/group_std_mean": 0.16161861419677734,
"signal/accuracy_reward/group_zero_std_frac": 0.522222226858139,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9881903648376464,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06030273362994194,
"signal/advantage_abs_mean": 0.7486638188362121,
"signal/advantage_pre_scale_abs_mean": 0.07886244356632233,
"signal/advantage_pre_scale_std": 0.1308459535241127,
"signal/advantage_std": 0.9828376650810242,
"signal/batch_coverage_0/centered_abs_mean": 0.16480848789215088,
"signal/batch_coverage_0/group_std_mean": 0.20503064393997192,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03875530213117599,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002356761461123824,
"signal/batch_coverage_1/centered_abs_mean": 0.16480848789215088,
"signal/batch_coverage_1/group_std_mean": 0.20503064393997192,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03875530213117599,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002356761461123824,
"signal/batch_coverage_10/centered_abs_mean": 0.1872877836227417,
"signal/batch_coverage_10/group_std_mean": 0.23541199266910554,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.04408646076917648,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0026782152708619833,
"signal/batch_coverage_15/centered_abs_mean": 0.191227462887764,
"signal/batch_coverage_15/group_std_mean": 0.24096313416957854,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.04502549096941948,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0027345526497811077,
"signal/batch_coverage_20/centered_abs_mean": 0.1915127158164978,
"signal/batch_coverage_20/group_std_mean": 0.24182832539081572,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04509256184101105,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0027386318426579235,
"signal/batch_coverage_25/centered_abs_mean": 0.19325141608715057,
"signal/batch_coverage_25/group_std_mean": 0.24404240250587464,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04548835903406143,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0027634951751679183,
"signal/batch_coverage_5/centered_abs_mean": 0.18021537065505983,
"signal/batch_coverage_5/group_std_mean": 0.2256871372461319,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.04239363595843315,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002577079739421606,
"signal/brier_reward/centered_abs_mean": 0.1192615807056427,
"signal/brier_reward/group_std_mean": 0.15455422103404998,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19625029265880584,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01192615795880556,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02237374596297741,
"signal/confidence_uniqueness_reward/group_std_mean": 0.03215453736484051,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03662349134683609,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0022373746149241926,
"signal/format_reward/centered_abs_mean": 0.006553819449618459,
"signal/format_reward/group_std_mean": 0.012815378420054913,
"signal/format_reward/group_zero_std_frac": 0.9444444417953491,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.05154884457588196,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0032769097248092293,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34422631859779357,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4102538049221039,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5683587074279786,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034422630071640016,
"step": 145
},
{
"calibration/aurc": 0.14924825676357584,
"calibration/batch_distribution_entropy": 0.9685756850446536,
"calibration/buffer_distribution_entropy": 0.9587597216971032,
"calibration/confidence_entropy": 0.46276994094843066,
"calibration/coverage@0%": 0.047424412133929346,
"calibration/coverage@1%": 0.047424412133929346,
"calibration/coverage@10%": 0.37805164489722454,
"calibration/coverage@15%": 0.5497069426977248,
"calibration/coverage@20%": 0.6994764397905759,
"calibration/coverage@25%": 0.8060509380453752,
"calibration/coverage@30%": 0.8744328097731238,
"calibration/coverage@5%": 0.3264170143967666,
"calibration/ece": 0.21123060769717777,
"calibration/mean_confidence": 0.5284815132218634,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005642361111111094,
"completions/max_length": 3231.2,
"completions/max_terminated_length": 3231.2,
"completions/mean_length": 979.1048828125,
"completions/mean_terminated_length": 984.62783203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 304.2,
"epoch": 0.3599955000562493,
"grad_norm": 0.002073184819892049,
"learning_rate": 1.7469879518072292e-06,
"loss": -0.0112,
"num_tokens": 344654651.0,
"reward": 1.03697669506073,
"reward_std": 0.11584978252649307,
"rewards/accuracy_reward": 0.7082465171813965,
"rewards/batch_coverage_0": 0.33286563456058504,
"rewards/batch_coverage_1": 0.33286563456058504,
"rewards/batch_coverage_10": 0.4007334589958191,
"rewards/batch_coverage_15": 0.41015023589134214,
"rewards/batch_coverage_20": 0.4160439193248749,
"rewards/batch_coverage_25": 0.4204522013664246,
"rewards/batch_coverage_5": 0.3733904421329498,
"rewards/brier_reward": 0.8169546723365784,
"rewards/confidence_uniqueness_reward": 0.9425576090812683,
"rewards/format_reward": 0.9940104007720947,
"rewards/frontier_entropy_batch_reward": -0.28520033359527586,
"signal/accuracy_reward/centered_abs_mean": 0.14129231870174408,
"signal/accuracy_reward/group_std_mean": 0.18610043525695802,
"signal/accuracy_reward/group_zero_std_frac": 0.4666666746139526,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0182552337646484,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07064615935087204,
"signal/advantage_abs_mean": 0.7560475707054138,
"signal/advantage_pre_scale_abs_mean": 0.0876899853348732,
"signal/advantage_pre_scale_std": 0.13981059342622756,
"signal/advantage_std": 0.9830022573471069,
"signal/batch_coverage_0/centered_abs_mean": 0.16787650883197786,
"signal/batch_coverage_0/group_std_mean": 0.21175734102725982,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.035611017048358916,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0024006341118365526,
"signal/batch_coverage_1/centered_abs_mean": 0.16787650883197786,
"signal/batch_coverage_1/group_std_mean": 0.21175734102725982,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.035611017048358916,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0024006341118365526,
"signal/batch_coverage_10/centered_abs_mean": 0.19135403931140899,
"signal/batch_coverage_10/group_std_mean": 0.24295100271701814,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.04067305475473404,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0027363628149032595,
"signal/batch_coverage_15/centered_abs_mean": 0.19120026230812073,
"signal/batch_coverage_15/group_std_mean": 0.2438488245010376,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.04066586680710316,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002734163636341691,
"signal/batch_coverage_20/centered_abs_mean": 0.19174001216888428,
"signal/batch_coverage_20/group_std_mean": 0.2444555252790451,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04067807123064995,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002741882111877203,
"signal/batch_coverage_25/centered_abs_mean": 0.193832927942276,
"signal/batch_coverage_25/group_std_mean": 0.2474503844976425,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04112314581871033,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002771810814738274,
"signal/batch_coverage_5/centered_abs_mean": 0.1800677478313446,
"signal/batch_coverage_5/group_std_mean": 0.2272460490465164,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.038200228661298755,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0025749687105417252,
"signal/brier_reward/centered_abs_mean": 0.12617065459489823,
"signal/brier_reward/group_std_mean": 0.16255717277526854,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18507564663887024,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012617065571248531,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.022591123729944228,
"signal/confidence_uniqueness_reward/group_std_mean": 0.033147013187408446,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.033954189717769624,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002259112545289099,
"signal/format_reward/centered_abs_mean": 0.008675130270421504,
"signal/format_reward/group_std_mean": 0.016100356169044972,
"signal/format_reward/group_zero_std_frac": 0.9333333492279052,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.06509829834103584,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.004337565135210752,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32594712972640993,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39480751752853394,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.48510047793388367,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03259471394121647,
"step": 150
},
{
"epoch": 0.3599955000562493,
"eval_calibration/aurc": 0.15099810091529658,
"eval_calibration/batch_distribution_entropy": 0.860150571119397,
"eval_calibration/buffer_distribution_entropy": 0.9641795024573184,
"eval_calibration/confidence_entropy": 0.45122327873025564,
"eval_calibration/coverage@0%": 0.17708333333333334,
"eval_calibration/coverage@1%": 0.17708333333333334,
"eval_calibration/coverage@10%": 0.4791666666666667,
"eval_calibration/coverage@15%": 0.6458333333333334,
"eval_calibration/coverage@20%": 0.8385416666666666,
"eval_calibration/coverage@25%": 0.9270833333333334,
"eval_calibration/coverage@30%": 0.9739583333333334,
"eval_calibration/coverage@5%": 0.28125,
"eval_calibration/ece": 0.2121776145833333,
"eval_calibration/mean_confidence": 0.62442065625,
"eval_completions/clipped_ratio": 0.00434027777777779,
"eval_completions/max_length": 2256.6666666666665,
"eval_completions/max_terminated_length": 2256.6666666666665,
"eval_completions/mean_length": 999.3628641764323,
"eval_completions/mean_terminated_length": 1003.8037719726562,
"eval_completions/min_length": 206.66666666666666,
"eval_completions/min_terminated_length": 420.8333333333333,
"eval_loss": 0.0,
"eval_num_tokens": 344654651.0,
"eval_reward": 0.9219675163427988,
"eval_reward_std": 0.2387667124470075,
"eval_rewards/accuracy_reward": 0.6883680621782938,
"eval_rewards/batch_coverage_0": 0.04401817545294762,
"eval_rewards/batch_coverage_1": 0.04401817545294762,
"eval_rewards/batch_coverage_10": 0.06404042275001605,
"eval_rewards/batch_coverage_15": 0.10943988462289174,
"eval_rewards/batch_coverage_20": 0.15806531036893526,
"eval_rewards/batch_coverage_25": 0.2252073884010315,
"eval_rewards/batch_coverage_5": 0.05134963368376096,
"eval_rewards/brier_reward": 0.81276535987854,
"eval_rewards/confidence_uniqueness_reward": 0.8863547345002493,
"eval_rewards/format_reward": 0.9947916766007742,
"eval_rewards/frontier_entropy_batch_reward": -0.9947916766007742,
"eval_runtime": 177.8666,
"eval_samples_per_second": 5.622,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4130316823720932,
"eval_signal/accuracy_reward/group_std_mean": 0.46021030843257904,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8719471295674642,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2065158411860466,
"eval_signal/advantage_abs_mean": 0.8689649800459543,
"eval_signal/advantage_pre_scale_abs_mean": 0.20798486719528833,
"eval_signal/advantage_pre_scale_std": 0.23649577299753824,
"eval_signal/advantage_std": 0.9864074190457662,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.204094630976518,
"eval_signal/batch_coverage_0/group_std_mean": 0.29198014239470166,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012345977903654179,
"eval_signal/batch_coverage_0/weight": 0.014299999922513962,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.0029185531505693993,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.204094630976518,
"eval_signal/batch_coverage_1/group_std_mean": 0.29198014239470166,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012345977903654179,
"eval_signal/batch_coverage_1/weight": 0.014299999922513962,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.0029185531505693993,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.1684000020225843,
"eval_signal/batch_coverage_10/group_std_mean": 0.23056225727001825,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.010209185304120183,
"eval_signal/batch_coverage_10/weight": 0.014299999922513962,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.0024081199468734362,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.1649662603934606,
"eval_signal/batch_coverage_15/group_std_mean": 0.22036516666412354,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.00998325839949151,
"eval_signal/batch_coverage_15/weight": 0.014299999922513962,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.00235901753573368,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.1855208824078242,
"eval_signal/batch_coverage_20/group_std_mean": 0.23475823799769083,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.011228633423646292,
"eval_signal/batch_coverage_20/weight": 0.014299999922513962,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.002652948683438202,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.259243165453275,
"eval_signal/batch_coverage_25/group_std_mean": 0.31990206241607666,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015624676054964462,
"eval_signal/batch_coverage_25/weight": 0.014299999922513962,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.0037071771221235394,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.19574902951717377,
"eval_signal/batch_coverage_5/group_std_mean": 0.2785518889625867,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.011863870856662592,
"eval_signal/batch_coverage_5/weight": 0.014299999922513962,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.0027992110699415207,
"eval_signal/brier_reward/centered_abs_mean": 0.18697136143843332,
"eval_signal/brier_reward/group_std_mean": 0.24710966646671295,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07883285731077194,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.018697135771314304,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.04890578364332517,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.06993046589195728,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02052827924489975,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0048905784885088606,
"eval_signal/format_reward/centered_abs_mean": 0.010091145678112904,
"eval_signal/format_reward/group_std_mean": 0.029462782355646294,
"eval_signal/format_reward/group_zero_std_frac": 0.8333333532015482,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.020615727019806702,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.005045572839056452,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.010091145678112904,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.029462782355646294,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.8333333532015482,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.004123145559181769,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0010091145522892475,
"eval_steps_per_second": 0.034,
"step": 150
},
{
"calibration/aurc": 0.12354927845408956,
"calibration/batch_distribution_entropy": 0.9447950727046628,
"calibration/buffer_distribution_entropy": 0.9657919927678869,
"calibration/confidence_entropy": 0.4733415813935542,
"calibration/coverage@0%": 0.03344089186362829,
"calibration/coverage@1%": 0.03344089186362829,
"calibration/coverage@10%": 0.5678552001740644,
"calibration/coverage@15%": 0.714853949086162,
"calibration/coverage@20%": 0.8357838337684944,
"calibration/coverage@25%": 0.9004188424717146,
"calibration/coverage@30%": 0.9442232375979112,
"calibration/coverage@5%": 0.33468212598936475,
"calibration/ece": 0.16670257491313864,
"calibration/mean_confidence": 0.6146071591544293,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003038194444444442,
"completions/max_length": 3241.8,
"completions/max_terminated_length": 3241.8,
"completions/mean_length": 996.8377685546875,
"completions/mean_terminated_length": 999.9290771484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 359.8,
"epoch": 0.3719953500581243,
"grad_norm": 0.0023586146999150515,
"learning_rate": 1.5963855421686747e-06,
"loss": -0.0001,
"num_tokens": 359245934.0,
"reward": 1.0534717082977294,
"reward_std": 0.11110627800226211,
"rewards/accuracy_reward": 0.741406238079071,
"rewards/batch_coverage_0": 0.3595211863517761,
"rewards/batch_coverage_1": 0.3595211863517761,
"rewards/batch_coverage_10": 0.4280384540557861,
"rewards/batch_coverage_15": 0.43416666984558105,
"rewards/batch_coverage_20": 0.443005895614624,
"rewards/batch_coverage_25": 0.44818037152290346,
"rewards/batch_coverage_5": 0.40228732824325564,
"rewards/brier_reward": 0.8326889395713806,
"rewards/confidence_uniqueness_reward": 0.9405958533287049,
"rewards/format_reward": 0.9967013955116272,
"rewards/frontier_entropy_batch_reward": -0.3401912569999695,
"signal/accuracy_reward/centered_abs_mean": 0.13376193642616271,
"signal/accuracy_reward/group_std_mean": 0.17467234432697296,
"signal/accuracy_reward/group_zero_std_frac": 0.5055555701255798,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0903648853302002,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06688096821308136,
"signal/advantage_abs_mean": 0.753935980796814,
"signal/advantage_pre_scale_abs_mean": 0.0841080218553543,
"signal/advantage_pre_scale_std": 0.13811270892620087,
"signal/advantage_std": 0.982841157913208,
"signal/batch_coverage_0/centered_abs_mean": 0.15721264779567717,
"signal/batch_coverage_0/group_std_mean": 0.19686352014541625,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03701254278421402,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022481407970190046,
"signal/batch_coverage_1/centered_abs_mean": 0.15721264779567717,
"signal/batch_coverage_1/group_std_mean": 0.19686352014541625,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03701254278421402,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0022481407970190046,
"signal/batch_coverage_10/centered_abs_mean": 0.17821633517742158,
"signal/batch_coverage_10/group_std_mean": 0.22452322840690614,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.04192670062184334,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0025484934914857147,
"signal/batch_coverage_15/centered_abs_mean": 0.1773567318916321,
"signal/batch_coverage_15/group_std_mean": 0.22416456639766694,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.04177441671490669,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.00253620115108788,
"signal/batch_coverage_20/centered_abs_mean": 0.1758658766746521,
"signal/batch_coverage_20/group_std_mean": 0.2232010543346405,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.041483426839113234,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025148820132017136,
"signal/batch_coverage_25/centered_abs_mean": 0.1810859262943268,
"signal/batch_coverage_25/group_std_mean": 0.22983238101005554,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04270444884896278,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002589528728276491,
"signal/batch_coverage_5/centered_abs_mean": 0.16989169120788575,
"signal/batch_coverage_5/group_std_mean": 0.2133048117160797,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03996897637844086,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.00242945128120482,
"signal/brier_reward/centered_abs_mean": 0.11728623360395432,
"signal/brier_reward/group_std_mean": 0.152217036485672,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19326700866222382,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011728623509407043,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.022488463297486307,
"signal/confidence_uniqueness_reward/group_std_mean": 0.03181953355669975,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03719873316586018,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0022488463902845977,
"signal/format_reward/centered_abs_mean": 0.005566406203433872,
"signal/format_reward/group_std_mean": 0.010888969711959361,
"signal/format_reward/group_zero_std_frac": 0.9527777791023254,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.04535420574247837,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.002783203101716936,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33973073959350586,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40602503418922425,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5617510795593261,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033973073959350585,
"step": 155
},
{
"calibration/aurc": 0.10859563453296701,
"calibration/batch_distribution_entropy": 0.9128716647986437,
"calibration/buffer_distribution_entropy": 0.9675145591680068,
"calibration/confidence_entropy": 0.46302861735613277,
"calibration/coverage@0%": 0.14967362924281985,
"calibration/coverage@1%": 0.18256500217580504,
"calibration/coverage@10%": 0.665139670609524,
"calibration/coverage@15%": 0.7675148108312609,
"calibration/coverage@20%": 0.822568986367634,
"calibration/coverage@25%": 0.8676011433597186,
"calibration/coverage@30%": 0.8970976253298154,
"calibration/coverage@5%": 0.5232171997389033,
"calibration/ece": 0.12741486753620287,
"calibration/mean_confidence": 0.653657961062989,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00546875,
"completions/max_length": 3256.4,
"completions/max_terminated_length": 3256.4,
"completions/mean_length": 1002.3953002929687,
"completions/mean_terminated_length": 1007.9062255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 312.6,
"epoch": 0.38399520005999926,
"grad_norm": 0.0021716596093028784,
"learning_rate": 1.4457831325301204e-06,
"loss": -0.0154,
"num_tokens": 373880824.0,
"reward": 1.01896413564682,
"reward_std": 0.11577980667352676,
"rewards/accuracy_reward": 0.6807291746139527,
"rewards/batch_coverage_0": 0.34664570689201357,
"rewards/batch_coverage_1": 0.34664570689201357,
"rewards/batch_coverage_10": 0.41527708768844607,
"rewards/batch_coverage_15": 0.41911380290985106,
"rewards/batch_coverage_20": 0.42773756980895994,
"rewards/batch_coverage_25": 0.431087327003479,
"rewards/batch_coverage_5": 0.38476539254188535,
"rewards/brier_reward": 0.8177592873573303,
"rewards/confidence_uniqueness_reward": 0.9373740196228028,
"rewards/format_reward": 0.9927083134651185,
"rewards/frontier_entropy_batch_reward": -0.32897132635116577,
"signal/accuracy_reward/centered_abs_mean": 0.13287760615348815,
"signal/accuracy_reward/group_std_mean": 0.17416149377822876,
"signal/accuracy_reward/group_zero_std_frac": 0.5083333373069763,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0287292838096618,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06643880307674407,
"signal/advantage_abs_mean": 0.7522549510002137,
"signal/advantage_pre_scale_abs_mean": 0.08743151724338531,
"signal/advantage_pre_scale_std": 0.14282708168029784,
"signal/advantage_std": 0.9829399466514588,
"signal/batch_coverage_0/centered_abs_mean": 0.15453094840049744,
"signal/batch_coverage_0/group_std_mean": 0.19441550970077515,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03416990488767624,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022097925655543806,
"signal/batch_coverage_1/centered_abs_mean": 0.15453094840049744,
"signal/batch_coverage_1/group_std_mean": 0.19441550970077515,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03416990488767624,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0022097925655543806,
"signal/batch_coverage_10/centered_abs_mean": 0.17795575857162477,
"signal/batch_coverage_10/group_std_mean": 0.2252943605184555,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.039376800507307054,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002544767269864678,
"signal/batch_coverage_15/centered_abs_mean": 0.17797406911849975,
"signal/batch_coverage_15/group_std_mean": 0.22535083889961244,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03939326554536819,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002545029018074274,
"signal/batch_coverage_20/centered_abs_mean": 0.18285177052021026,
"signal/batch_coverage_20/group_std_mean": 0.23180427253246308,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04048406258225441,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0026147802826017143,
"signal/batch_coverage_25/centered_abs_mean": 0.18532668352127074,
"signal/batch_coverage_25/group_std_mean": 0.23487534523010253,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.0410227507352829,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0026501716580241918,
"signal/batch_coverage_5/centered_abs_mean": 0.16579327285289763,
"signal/batch_coverage_5/group_std_mean": 0.20903973281383514,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03667203634977341,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023708438500761988,
"signal/brier_reward/centered_abs_mean": 0.12218662053346634,
"signal/brier_reward/group_std_mean": 0.15716723203659058,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18889405727386474,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012218662351369858,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02724316492676735,
"signal/confidence_uniqueness_reward/group_std_mean": 0.039476413279771805,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.042055241763591766,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0027243165764957666,
"signal/format_reward/centered_abs_mean": 0.011024305410683156,
"signal/format_reward/group_std_mean": 0.01946833319962025,
"signal/format_reward/group_zero_std_frac": 0.9222222447395325,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.08364587873220444,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.005512152705341578,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3342073380947113,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4018004834651947,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5187286734580994,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03342073522508145,
"step": 160
},
{
"calibration/aurc": 0.13086824597819938,
"calibration/batch_distribution_entropy": 0.9472034847512509,
"calibration/buffer_distribution_entropy": 0.9666214722500699,
"calibration/confidence_entropy": 0.45481414785057267,
"calibration/coverage@0%": 0.04582357782475198,
"calibration/coverage@1%": 0.09478191115808532,
"calibration/coverage@10%": 0.53772076030555,
"calibration/coverage@15%": 0.6519230017685322,
"calibration/coverage@20%": 0.780576519934485,
"calibration/coverage@25%": 0.8727473524512137,
"calibration/coverage@30%": 0.9090184937611407,
"calibration/coverage@5%": 0.33352284393052795,
"calibration/ece": 0.17592407358849602,
"calibration/mean_confidence": 0.5471417070050063,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0050347222222222095,
"completions/max_length": 3369.8,
"completions/max_terminated_length": 3369.8,
"completions/mean_length": 1023.3869873046875,
"completions/mean_terminated_length": 1028.52685546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 362.0,
"epoch": 0.39599505006187424,
"grad_norm": 0.002039579441770911,
"learning_rate": 1.2951807228915664e-06,
"loss": -0.0096,
"num_tokens": 388809314.0,
"reward": 1.0179754614830017,
"reward_std": 0.11339564919471741,
"rewards/accuracy_reward": 0.6663194417953491,
"rewards/batch_coverage_0": 0.3367995202541351,
"rewards/batch_coverage_1": 0.3367995202541351,
"rewards/batch_coverage_10": 0.3942675650119781,
"rewards/batch_coverage_15": 0.4045478582382202,
"rewards/batch_coverage_20": 0.4149549126625061,
"rewards/batch_coverage_25": 0.4195962190628052,
"rewards/batch_coverage_5": 0.3712313652038574,
"rewards/brier_reward": 0.8144054651260376,
"rewards/confidence_uniqueness_reward": 0.9440905332565308,
"rewards/format_reward": 0.9947048664093018,
"rewards/frontier_entropy_batch_reward": -0.266845241189003,
"signal/accuracy_reward/centered_abs_mean": 0.13079427033662797,
"signal/accuracy_reward/group_std_mean": 0.1758062332868576,
"signal/accuracy_reward/group_zero_std_frac": 0.4861111223697662,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9914317846298217,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06539713516831398,
"signal/advantage_abs_mean": 0.7486802816390992,
"signal/advantage_pre_scale_abs_mean": 0.0846073180437088,
"signal/advantage_pre_scale_std": 0.13776987195014953,
"signal/advantage_std": 0.9829599499702454,
"signal/batch_coverage_0/centered_abs_mean": 0.15715940296649933,
"signal/batch_coverage_0/group_std_mean": 0.2002890944480896,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03427259549498558,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002247379417531192,
"signal/batch_coverage_1/centered_abs_mean": 0.15715940296649933,
"signal/batch_coverage_1/group_std_mean": 0.2002890944480896,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03427259549498558,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002247379417531192,
"signal/batch_coverage_10/centered_abs_mean": 0.17147505581378936,
"signal/batch_coverage_10/group_std_mean": 0.21927900314331056,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.037464487552642825,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002452093129977584,
"signal/batch_coverage_15/centered_abs_mean": 0.1753909856081009,
"signal/batch_coverage_15/group_std_mean": 0.22453003227710724,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03826990127563477,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0025080910883843897,
"signal/batch_coverage_20/centered_abs_mean": 0.17529043555259705,
"signal/batch_coverage_20/group_std_mean": 0.22489323019981383,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03827468380331993,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025066531728953124,
"signal/batch_coverage_25/centered_abs_mean": 0.17560572326183319,
"signal/batch_coverage_25/group_std_mean": 0.22626294195652008,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.0383146844804287,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0025111618917435406,
"signal/batch_coverage_5/centered_abs_mean": 0.16724469661712646,
"signal/batch_coverage_5/group_std_mean": 0.21355923116207123,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0364637590944767,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023915990255773067,
"signal/brier_reward/centered_abs_mean": 0.11954751163721085,
"signal/brier_reward/group_std_mean": 0.15770359933376313,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18224007785320281,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011954751610755921,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.022302094474434854,
"signal/confidence_uniqueness_reward/group_std_mean": 0.03391831777989864,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03458714000880718,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0022302095079794527,
"signal/format_reward/centered_abs_mean": 0.009185112826526166,
"signal/format_reward/group_std_mean": 0.017785289883613588,
"signal/format_reward/group_zero_std_frac": 0.925000011920929,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0728104680776596,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.004592556413263083,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.31106963753700256,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3822822213172913,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4759585916996002,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.031106964498758317,
"step": 165
},
{
"calibration/aurc": 0.1460920793342822,
"calibration/batch_distribution_entropy": 0.8878700887244488,
"calibration/buffer_distribution_entropy": 0.9668655536975838,
"calibration/confidence_entropy": 0.43763451430842687,
"calibration/coverage@0%": 0.018759519147084423,
"calibration/coverage@1%": 0.018759519147084423,
"calibration/coverage@10%": 0.42567357138337325,
"calibration/coverage@15%": 0.6675902364909806,
"calibration/coverage@20%": 0.8197856721949514,
"calibration/coverage@25%": 0.8918473680112828,
"calibration/coverage@30%": 0.943480103345044,
"calibration/coverage@5%": 0.08334285248041776,
"calibration/ece": 0.08988107862443054,
"calibration/mean_confidence": 0.6858332701331313,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004947916666666696,
"completions/max_length": 3697.6,
"completions/max_terminated_length": 3697.6,
"completions/mean_length": 1010.5817749023438,
"completions/mean_terminated_length": 1015.6004760742187,
"completions/min_length": 0.0,
"completions/min_terminated_length": 339.6,
"epoch": 0.4079949000637492,
"grad_norm": 0.0021854499354958534,
"learning_rate": 1.1445783132530121e-06,
"loss": -0.0112,
"num_tokens": 403540400.0,
"reward": 1.0409629344940186,
"reward_std": 0.110990709066391,
"rewards/accuracy_reward": 0.7241319417953491,
"rewards/batch_coverage_0": 0.37385630011558535,
"rewards/batch_coverage_1": 0.37385630011558535,
"rewards/batch_coverage_10": 0.4578577697277069,
"rewards/batch_coverage_15": 0.46536206603050234,
"rewards/batch_coverage_20": 0.472717946767807,
"rewards/batch_coverage_25": 0.47600876092910765,
"rewards/batch_coverage_5": 0.42083380818367006,
"rewards/brier_reward": 0.8414793729782104,
"rewards/confidence_uniqueness_reward": 0.9344589471817016,
"rewards/format_reward": 0.9950520753860473,
"rewards/frontier_entropy_batch_reward": -0.39701979160308837,
"signal/accuracy_reward/centered_abs_mean": 0.1273980036377907,
"signal/accuracy_reward/group_std_mean": 0.17125212252140046,
"signal/accuracy_reward/group_zero_std_frac": 0.5000000119209289,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0747498750686646,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06369900181889535,
"signal/advantage_abs_mean": 0.7358132243156433,
"signal/advantage_pre_scale_abs_mean": 0.08221107721328735,
"signal/advantage_pre_scale_std": 0.13774741888046266,
"signal/advantage_std": 0.9827988147735596,
"signal/batch_coverage_0/centered_abs_mean": 0.15270276367664337,
"signal/batch_coverage_0/group_std_mean": 0.19141809046268463,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03704690709710121,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0021836495259776713,
"signal/batch_coverage_1/centered_abs_mean": 0.15270276367664337,
"signal/batch_coverage_1/group_std_mean": 0.19141809046268463,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03704690709710121,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0021836495259776713,
"signal/batch_coverage_10/centered_abs_mean": 0.17446030974388121,
"signal/batch_coverage_10/group_std_mean": 0.22139351963996887,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.04217695370316506,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002494782442227006,
"signal/batch_coverage_15/centered_abs_mean": 0.17822808623313904,
"signal/batch_coverage_15/group_std_mean": 0.2266361564397812,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.04310111626982689,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0025486615020781757,
"signal/batch_coverage_20/centered_abs_mean": 0.1822631001472473,
"signal/batch_coverage_20/group_std_mean": 0.23209832310676576,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.044050132483243944,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0026063622906804087,
"signal/batch_coverage_25/centered_abs_mean": 0.1828020066022873,
"signal/batch_coverage_25/group_std_mean": 0.2328880548477173,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04419608265161514,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0026140686590224505,
"signal/batch_coverage_5/centered_abs_mean": 0.16622574925422667,
"signal/batch_coverage_5/group_std_mean": 0.20924559533596038,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.040277618914842606,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002377028251066804,
"signal/brier_reward/centered_abs_mean": 0.10948136746883393,
"signal/brier_reward/group_std_mean": 0.14314137399196625,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18513674437999725,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.010948137007653713,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.026617776975035666,
"signal/confidence_uniqueness_reward/group_std_mean": 0.037878865376114845,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04497217386960983,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0026617777068167923,
"signal/format_reward/centered_abs_mean": 0.007883029547519982,
"signal/format_reward/group_std_mean": 0.014791851490736007,
"signal/format_reward/group_zero_std_frac": 0.9361111283302307,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0656326726078987,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.003941514773759991,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.35060847997665406,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4169707238674164,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5929932951927185,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03506084680557251,
"step": 170
},
{
"calibration/aurc": 0.0988401164454502,
"calibration/batch_distribution_entropy": 0.950985405780948,
"calibration/buffer_distribution_entropy": 0.9654078631203739,
"calibration/confidence_entropy": 0.46972934712497294,
"calibration/coverage@0%": 0.1369407160085553,
"calibration/coverage@1%": 0.1495061610347333,
"calibration/coverage@10%": 0.5456195683369263,
"calibration/coverage@15%": 0.7819692461155509,
"calibration/coverage@20%": 0.9106661893478408,
"calibration/coverage@25%": 0.9679260661150175,
"calibration/coverage@30%": 1.0,
"calibration/coverage@5%": 0.32950498555415597,
"calibration/ece": 0.18936169514098597,
"calibration/mean_confidence": 0.5877690730480742,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00894097222222221,
"completions/max_length": 3482.6,
"completions/max_terminated_length": 3482.6,
"completions/mean_length": 1032.5648803710938,
"completions/mean_terminated_length": 1041.8525634765624,
"completions/min_length": 0.0,
"completions/min_terminated_length": 339.6,
"epoch": 0.4199947500656242,
"grad_norm": 0.0020933363121002913,
"learning_rate": 9.93975903614458e-07,
"loss": -0.0201,
"num_tokens": 418543515.0,
"reward": 1.0426448345184327,
"reward_std": 0.11992864906787873,
"rewards/accuracy_reward": 0.7254340291023255,
"rewards/batch_coverage_0": 0.3316203892230988,
"rewards/batch_coverage_1": 0.3316203892230988,
"rewards/batch_coverage_10": 0.4126744449138641,
"rewards/batch_coverage_15": 0.4222039520740509,
"rewards/batch_coverage_20": 0.4333043694496155,
"rewards/batch_coverage_25": 0.43859469294548037,
"rewards/batch_coverage_5": 0.3796039819717407,
"rewards/brier_reward": 0.8254922986030578,
"rewards/confidence_uniqueness_reward": 0.9372236371040344,
"rewards/format_reward": 0.9900173664093017,
"rewards/frontier_entropy_batch_reward": -0.3067204415798187,
"signal/accuracy_reward/centered_abs_mean": 0.1384494349360466,
"signal/accuracy_reward/group_std_mean": 0.18521187007427214,
"signal/accuracy_reward/group_zero_std_frac": 0.4638888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0526963233947755,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0692247174680233,
"signal/advantage_abs_mean": 0.7402390241622925,
"signal/advantage_pre_scale_abs_mean": 0.08802802860736847,
"signal/advantage_pre_scale_std": 0.14837550222873688,
"signal/advantage_std": 0.982962167263031,
"signal/batch_coverage_0/centered_abs_mean": 0.15836002230644225,
"signal/batch_coverage_0/group_std_mean": 0.19843866527080536,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03455947414040565,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022645482793450357,
"signal/batch_coverage_1/centered_abs_mean": 0.15836002230644225,
"signal/batch_coverage_1/group_std_mean": 0.19843866527080536,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03455947414040565,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0022645482793450357,
"signal/batch_coverage_10/centered_abs_mean": 0.18111312985420228,
"signal/batch_coverage_10/group_std_mean": 0.22880154252052307,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03951704278588295,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002589917741715908,
"signal/batch_coverage_15/centered_abs_mean": 0.1822204291820526,
"signal/batch_coverage_15/group_std_mean": 0.23108108341693878,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03973933905363083,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002605752134695649,
"signal/batch_coverage_20/centered_abs_mean": 0.18789599537849427,
"signal/batch_coverage_20/group_std_mean": 0.23864111602306365,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.040997046232223514,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0026869128923863173,
"signal/batch_coverage_25/centered_abs_mean": 0.19290711581707,
"signal/batch_coverage_25/group_std_mean": 0.24501362144947053,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.0420980766415596,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002758571831509471,
"signal/batch_coverage_5/centered_abs_mean": 0.17007304430007936,
"signal/batch_coverage_5/group_std_mean": 0.2132638841867447,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.037092185020446776,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0024320445023477077,
"signal/brier_reward/centered_abs_mean": 0.11827056854963303,
"signal/brier_reward/group_std_mean": 0.15516594052314758,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18027611076831818,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011827056482434272,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02871614247560501,
"signal/confidence_uniqueness_reward/group_std_mean": 0.043540383130311965,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.044160565733909606,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002871614182367921,
"signal/format_reward/centered_abs_mean": 0.01482747383415699,
"signal/format_reward/group_std_mean": 0.02657197751104832,
"signal/format_reward/group_zero_std_frac": 0.8944444537162781,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.11438467130064964,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007413736917078495,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32246204614639284,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3913597702980042,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.49285311698913575,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03224620372056961,
"step": 175
},
{
"calibration/aurc": 0.07447960795488509,
"calibration/batch_distribution_entropy": 0.9584111436492234,
"calibration/buffer_distribution_entropy": 0.9659745360423712,
"calibration/confidence_entropy": 0.4698024707118339,
"calibration/coverage@0%": 0.08605942025799326,
"calibration/coverage@1%": 0.2450125384594614,
"calibration/coverage@10%": 0.7343520715607339,
"calibration/coverage@15%": 0.8616261686041685,
"calibration/coverage@20%": 0.9261610751354826,
"calibration/coverage@25%": 0.9570680628272251,
"calibration/coverage@30%": 0.9738219895287958,
"calibration/coverage@5%": 0.5089204790445596,
"calibration/ece": 0.20025433500219494,
"calibration/mean_confidence": 0.5928821674128748,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009027777777777768,
"completions/max_length": 3789.4,
"completions/max_terminated_length": 3789.4,
"completions/mean_length": 999.3691162109375,
"completions/mean_terminated_length": 1008.4784057617187,
"completions/min_length": 0.0,
"completions/min_terminated_length": 367.6,
"epoch": 0.4319946000674992,
"grad_norm": 0.0020875383634120226,
"learning_rate": 8.433734939759036e-07,
"loss": -0.0193,
"num_tokens": 433156215.0,
"reward": 1.0356385707855225,
"reward_std": 0.12121915966272354,
"rewards/accuracy_reward": 0.7159722328186036,
"rewards/batch_coverage_0": 0.3323968589305878,
"rewards/batch_coverage_1": 0.3323968589305878,
"rewards/batch_coverage_10": 0.390076208114624,
"rewards/batch_coverage_15": 0.3979366660118103,
"rewards/batch_coverage_20": 0.409983891248703,
"rewards/batch_coverage_25": 0.41360154151916506,
"rewards/batch_coverage_5": 0.36534075140953065,
"rewards/brier_reward": 0.8069019794464112,
"rewards/confidence_uniqueness_reward": 0.9386700630187989,
"rewards/format_reward": 0.990625,
"rewards/frontier_entropy_batch_reward": -0.2999406814575195,
"signal/accuracy_reward/centered_abs_mean": 0.13981119692325591,
"signal/accuracy_reward/group_std_mean": 0.18070069551467896,
"signal/accuracy_reward/group_zero_std_frac": 0.4972222328186035,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0410431146621704,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06990559846162796,
"signal/advantage_abs_mean": 0.7573927998542785,
"signal/advantage_pre_scale_abs_mean": 0.09159428179264069,
"signal/advantage_pre_scale_std": 0.15012040138244628,
"signal/advantage_std": 0.9829983353614807,
"signal/batch_coverage_0/centered_abs_mean": 0.15274601578712463,
"signal/batch_coverage_0/group_std_mean": 0.1947482168674469,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03270627558231354,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002184268040582538,
"signal/batch_coverage_1/centered_abs_mean": 0.15274601578712463,
"signal/batch_coverage_1/group_std_mean": 0.1947482168674469,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03270627558231354,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002184268040582538,
"signal/batch_coverage_10/centered_abs_mean": 0.16684764623641968,
"signal/batch_coverage_10/group_std_mean": 0.2129494309425354,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03565559573471546,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0023859212175011634,
"signal/batch_coverage_15/centered_abs_mean": 0.17019396722316743,
"signal/batch_coverage_15/group_std_mean": 0.2171155631542206,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03635375127196312,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002433773782104254,
"signal/batch_coverage_20/centered_abs_mean": 0.17045858800411223,
"signal/batch_coverage_20/group_std_mean": 0.21784802675247192,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.036375219374895094,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0024375576991587876,
"signal/batch_coverage_25/centered_abs_mean": 0.17278562784194945,
"signal/batch_coverage_25/group_std_mean": 0.22063855230808258,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03690861836075783,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002470834506675601,
"signal/batch_coverage_5/centered_abs_mean": 0.16150131821632385,
"signal/batch_coverage_5/group_std_mean": 0.20543525516986846,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03454604744911194,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023094687378033994,
"signal/brier_reward/centered_abs_mean": 0.11990782469511033,
"signal/brier_reward/group_std_mean": 0.15653329491615295,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17853015959262847,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011990783363580703,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02739548645913601,
"signal/confidence_uniqueness_reward/group_std_mean": 0.043045850843191145,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0408272460103035,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0027395487297326325,
"signal/format_reward/centered_abs_mean": 0.01424696184694767,
"signal/format_reward/group_std_mean": 0.02710859403014183,
"signal/format_reward/group_zero_std_frac": 0.8861111164093017,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.10568692535161972,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007123480923473835,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32398450970649717,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3915144741535187,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4832557141780853,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03239845186471939,
"step": 180
},
{
"calibration/aurc": 0.15428302573115638,
"calibration/batch_distribution_entropy": 0.9357928315322835,
"calibration/buffer_distribution_entropy": 0.9662215895426162,
"calibration/confidence_entropy": 0.465163569917188,
"calibration/coverage@0%": 0.027238980153342723,
"calibration/coverage@1%": 0.027238980153342723,
"calibration/coverage@10%": 0.23795636954751256,
"calibration/coverage@15%": 0.6718151575126063,
"calibration/coverage@20%": 0.751713937740079,
"calibration/coverage@25%": 0.9252781413612565,
"calibration/coverage@30%": 0.972780322862129,
"calibration/coverage@5%": 0.0916245972827669,
"calibration/ece": 0.17777518535858378,
"calibration/mean_confidence": 0.6189465787988068,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00425347222222221,
"completions/max_length": 3489.6,
"completions/max_terminated_length": 3489.6,
"completions/mean_length": 982.99912109375,
"completions/mean_terminated_length": 987.1817993164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 304.2,
"epoch": 0.44399445006937416,
"grad_norm": 0.002235629130154848,
"learning_rate": 6.927710843373495e-07,
"loss": -0.0098,
"num_tokens": 447570413.0,
"reward": 1.0250831365585327,
"reward_std": 0.1192633032798767,
"rewards/accuracy_reward": 0.6859375,
"rewards/batch_coverage_0": 0.33424278497695925,
"rewards/batch_coverage_1": 0.33424278497695925,
"rewards/batch_coverage_10": 0.40887650847435,
"rewards/batch_coverage_15": 0.4263133108615875,
"rewards/batch_coverage_20": 0.43644038438796995,
"rewards/batch_coverage_25": 0.4396053493022919,
"rewards/batch_coverage_5": 0.3843961298465729,
"rewards/brier_reward": 0.8267574667930603,
"rewards/confidence_uniqueness_reward": 0.9412113428115845,
"rewards/format_reward": 0.9952256917953491,
"rewards/frontier_entropy_batch_reward": -0.31822256445884706,
"signal/accuracy_reward/centered_abs_mean": 0.14396701455116273,
"signal/accuracy_reward/group_std_mean": 0.1907973110675812,
"signal/accuracy_reward/group_zero_std_frac": 0.4527777791023254,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0355172395706176,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07198350727558137,
"signal/advantage_abs_mean": 0.75169757604599,
"signal/advantage_pre_scale_abs_mean": 0.08948174864053726,
"signal/advantage_pre_scale_std": 0.1425995260477066,
"signal/advantage_std": 0.9830433845520019,
"signal/batch_coverage_0/centered_abs_mean": 0.15130364298820495,
"signal/batch_coverage_0/group_std_mean": 0.19145098626613616,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03131290823221207,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0021636419463902713,
"signal/batch_coverage_1/centered_abs_mean": 0.15130364298820495,
"signal/batch_coverage_1/group_std_mean": 0.19145098626613616,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03131290823221207,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0021636419463902713,
"signal/batch_coverage_10/centered_abs_mean": 0.1710590809583664,
"signal/batch_coverage_10/group_std_mean": 0.21738926470279693,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0353410042822361,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002446144772693515,
"signal/batch_coverage_15/centered_abs_mean": 0.1803262084722519,
"signal/batch_coverage_15/group_std_mean": 0.2301064521074295,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03722064346075058,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0025786647107452153,
"signal/batch_coverage_20/centered_abs_mean": 0.1807143956422806,
"signal/batch_coverage_20/group_std_mean": 0.2311330646276474,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.03731723800301552,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025842157658189533,
"signal/batch_coverage_25/centered_abs_mean": 0.1803894132375717,
"signal/batch_coverage_25/group_std_mean": 0.23168642818927765,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03726850301027298,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0025795686990022658,
"signal/batch_coverage_5/centered_abs_mean": 0.16504639387130737,
"signal/batch_coverage_5/group_std_mean": 0.20915851891040801,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03413994573056698,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0023601633962243795,
"signal/brier_reward/centered_abs_mean": 0.11856148093938827,
"signal/brier_reward/group_std_mean": 0.15411594808101653,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1713525176048279,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011856148391962052,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.024256234616041185,
"signal/confidence_uniqueness_reward/group_std_mean": 0.03698706589639187,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03531105667352676,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0024256234988570212,
"signal/format_reward/centered_abs_mean": 0.008620876912027597,
"signal/format_reward/group_std_mean": 0.018049365282058714,
"signal/format_reward/group_zero_std_frac": 0.919444453716278,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.06351001970469952,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0043104384560137985,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3371231436729431,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4051863729953766,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4865589618682861,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033712317049503324,
"step": 185
},
{
"calibration/aurc": 0.12534142353476713,
"calibration/batch_distribution_entropy": 0.8935035689275217,
"calibration/buffer_distribution_entropy": 0.9657430288586395,
"calibration/confidence_entropy": 0.44360261796641653,
"calibration/coverage@0%": 0.062130862681003,
"calibration/coverage@1%": 0.09181836268100299,
"calibration/coverage@10%": 0.5136960610219088,
"calibration/coverage@15%": 0.5486943991865573,
"calibration/coverage@20%": 0.8319114254606881,
"calibration/coverage@25%": 0.9504922758920801,
"calibration/coverage@30%": 0.9932291666666668,
"calibration/coverage@5%": 0.4307086185865148,
"calibration/ece": 0.15708990215467875,
"calibration/mean_confidence": 0.6694349975699834,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003038194444444442,
"completions/max_length": 3577.2,
"completions/max_terminated_length": 3577.2,
"completions/mean_length": 973.215625,
"completions/mean_terminated_length": 976.1800170898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 320.2,
"epoch": 0.45599430007124914,
"grad_norm": 0.002355813281610608,
"learning_rate": 5.421686746987952e-07,
"loss": -0.0063,
"num_tokens": 461864801.0,
"reward": 1.0505224466323853,
"reward_std": 0.10988152772188187,
"rewards/accuracy_reward": 0.7420138955116272,
"rewards/batch_coverage_0": 0.35429942011833193,
"rewards/batch_coverage_1": 0.35429942011833193,
"rewards/batch_coverage_10": 0.43965146541595457,
"rewards/batch_coverage_15": 0.446388041973114,
"rewards/batch_coverage_20": 0.4566673099994659,
"rewards/batch_coverage_25": 0.4625507116317749,
"rewards/batch_coverage_5": 0.40552425384521484,
"rewards/brier_reward": 0.8376203894615173,
"rewards/confidence_uniqueness_reward": 0.9377862334251403,
"rewards/format_reward": 0.9969618082046509,
"rewards/frontier_entropy_batch_reward": -0.38253204226493837,
"signal/accuracy_reward/centered_abs_mean": 0.13186849057674407,
"signal/accuracy_reward/group_std_mean": 0.17384477853775024,
"signal/accuracy_reward/group_zero_std_frac": 0.5000000178813935,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0667516231536864,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06593424528837204,
"signal/advantage_abs_mean": 0.7478165745735168,
"signal/advantage_pre_scale_abs_mean": 0.0822651669383049,
"signal/advantage_pre_scale_std": 0.1351626455783844,
"signal/advantage_std": 0.9828728079795838,
"signal/batch_coverage_0/centered_abs_mean": 0.15362754464149475,
"signal/batch_coverage_0/group_std_mean": 0.1938132733106613,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.03567029424011707,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0021968737710267305,
"signal/batch_coverage_1/centered_abs_mean": 0.15362754464149475,
"signal/batch_coverage_1/group_std_mean": 0.1938132733106613,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.03567029424011707,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0021968737710267305,
"signal/batch_coverage_10/centered_abs_mean": 0.17862621545791627,
"signal/batch_coverage_10/group_std_mean": 0.2278638958930969,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.041227002441883084,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002554354863241315,
"signal/batch_coverage_15/centered_abs_mean": 0.17997177839279174,
"signal/batch_coverage_15/group_std_mean": 0.22987563014030457,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.04153685420751572,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002573596499860287,
"signal/batch_coverage_20/centered_abs_mean": 0.18477944731712342,
"signal/batch_coverage_20/group_std_mean": 0.23605856001377107,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04267275333404541,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002642346080392599,
"signal/batch_coverage_25/centered_abs_mean": 0.18651285767555237,
"signal/batch_coverage_25/group_std_mean": 0.23866299986839296,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04308526143431664,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0026671338360756635,
"signal/batch_coverage_5/centered_abs_mean": 0.1672343373298645,
"signal/batch_coverage_5/group_std_mean": 0.21082105934619905,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03876652270555496,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002391451084986329,
"signal/brier_reward/centered_abs_mean": 0.11106197088956833,
"signal/brier_reward/group_std_mean": 0.1462969422340393,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17965082824230194,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01110619716346264,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.023622044920921327,
"signal/confidence_uniqueness_reward/group_std_mean": 0.03481765128672123,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03806819692254067,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0023622045759111644,
"signal/format_reward/centered_abs_mean": 0.005658637313172221,
"signal/format_reward/group_std_mean": 0.012903223745524883,
"signal/format_reward/group_zero_std_frac": 0.9388888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.04507702887058258,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0028293186565861105,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34954912662506105,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4182551920413971,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5644851803779602,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03495491221547127,
"step": 190
},
{
"calibration/aurc": 0.15241789162374858,
"calibration/batch_distribution_entropy": 0.9546151423561307,
"calibration/buffer_distribution_entropy": 0.9645976261126823,
"calibration/confidence_entropy": 0.46984322577054083,
"calibration/coverage@0%": 0.048566288114876136,
"calibration/coverage@1%": 0.13814962144820947,
"calibration/coverage@10%": 0.49193909973407013,
"calibration/coverage@15%": 0.6008877744998234,
"calibration/coverage@20%": 0.6417441775799271,
"calibration/coverage@25%": 0.6868293467699973,
"calibration/coverage@30%": 0.8711309729378772,
"calibration/coverage@5%": 0.38952757420411505,
"calibration/ece": 0.18659702684310667,
"calibration/mean_confidence": 0.5856564245526877,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006597222222222232,
"completions/max_length": 3366.8,
"completions/max_terminated_length": 3366.8,
"completions/mean_length": 995.2646850585937,
"completions/mean_terminated_length": 1001.8587280273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.8,
"epoch": 0.46799415007312406,
"grad_norm": 0.0021477374248206615,
"learning_rate": 3.91566265060241e-07,
"loss": -0.0182,
"num_tokens": 476411114.0,
"reward": 1.0307905673980713,
"reward_std": 0.10914622098207474,
"rewards/accuracy_reward": 0.6957465171813965,
"rewards/batch_coverage_0": 0.3612436711788177,
"rewards/batch_coverage_1": 0.3612436711788177,
"rewards/batch_coverage_10": 0.42941672205924986,
"rewards/batch_coverage_15": 0.4396386623382568,
"rewards/batch_coverage_20": 0.44845632314682005,
"rewards/batch_coverage_25": 0.4531115531921387,
"rewards/batch_coverage_5": 0.4056135654449463,
"rewards/brier_reward": 0.8242664337158203,
"rewards/confidence_uniqueness_reward": 0.9394988298416138,
"rewards/format_reward": 0.9933159708976745,
"rewards/frontier_entropy_batch_reward": -0.315689817070961,
"signal/accuracy_reward/centered_abs_mean": 0.11628146767616272,
"signal/accuracy_reward/group_std_mean": 0.15813961178064345,
"signal/accuracy_reward/group_zero_std_frac": 0.5277777791023255,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9150802969932557,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.05814073383808136,
"signal/advantage_abs_mean": 0.7424829006195068,
"signal/advantage_pre_scale_abs_mean": 0.08026361167430877,
"signal/advantage_pre_scale_std": 0.13525623083114624,
"signal/advantage_std": 0.9828639507293702,
"signal/batch_coverage_0/centered_abs_mean": 0.1481751650571823,
"signal/batch_coverage_0/group_std_mean": 0.1863509714603424,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.034362369775772096,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0021189047722145913,
"signal/batch_coverage_1/centered_abs_mean": 0.1481751650571823,
"signal/batch_coverage_1/group_std_mean": 0.1863509714603424,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.034362369775772096,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0021189047722145913,
"signal/batch_coverage_10/centered_abs_mean": 0.1680694818496704,
"signal/batch_coverage_10/group_std_mean": 0.21355166733264924,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03898909837007523,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0024033935274928807,
"signal/batch_coverage_15/centered_abs_mean": 0.17305652499198915,
"signal/batch_coverage_15/group_std_mean": 0.22033677697181703,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.040188524127006534,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0024747082497924566,
"signal/batch_coverage_20/centered_abs_mean": 0.17707459032535552,
"signal/batch_coverage_20/group_std_mean": 0.2258709490299225,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04103988632559776,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025321666151285173,
"signal/batch_coverage_25/centered_abs_mean": 0.1785203754901886,
"signal/batch_coverage_25/group_std_mean": 0.22794765830039979,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04126601964235306,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002552841370925307,
"signal/batch_coverage_5/centered_abs_mean": 0.16053160429000854,
"signal/batch_coverage_5/group_std_mean": 0.2028528332710266,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.037285619601607325,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002295601973310113,
"signal/brier_reward/centered_abs_mean": 0.11180263459682464,
"signal/brier_reward/group_std_mean": 0.1465301349759102,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17802698612213136,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011180263943970203,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.0259183157235384,
"signal/confidence_uniqueness_reward/group_std_mean": 0.038206398487091064,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04249723106622696,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0025918317027390004,
"signal/format_reward/centered_abs_mean": 0.011094835121184587,
"signal/format_reward/group_std_mean": 0.020084955915808677,
"signal/format_reward/group_zero_std_frac": 0.919444465637207,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.09070162996649742,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.005547417560592294,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32764663100242614,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3973482310771942,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5320763945579529,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03276466354727745,
"step": 195
},
{
"calibration/aurc": 0.12053504170385199,
"calibration/batch_distribution_entropy": 0.9277943160312582,
"calibration/buffer_distribution_entropy": 0.9649269249133663,
"calibration/confidence_entropy": 0.47151812106594904,
"calibration/coverage@0%": 0.04792212041884817,
"calibration/coverage@1%": 0.04792212041884817,
"calibration/coverage@10%": 0.550359947643979,
"calibration/coverage@15%": 0.6691781195462478,
"calibration/coverage@20%": 0.8827961387434555,
"calibration/coverage@25%": 0.9348958333333333,
"calibration/coverage@30%": 0.9635416666666667,
"calibration/coverage@5%": 0.36483693280977314,
"calibration/ece": 0.1565456610274869,
"calibration/mean_confidence": 0.6492343007308027,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005381944444444464,
"completions/max_length": 2979.8,
"completions/max_terminated_length": 2979.8,
"completions/mean_length": 960.2939331054688,
"completions/mean_terminated_length": 965.4896240234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 326.8,
"epoch": 0.47999400007499904,
"grad_norm": 0.002257067244499922,
"learning_rate": 2.409638554216868e-07,
"loss": -0.0133,
"num_tokens": 490541508.0,
"reward": 1.0378822088241577,
"reward_std": 0.10426538735628128,
"rewards/accuracy_reward": 0.7078993201255799,
"rewards/batch_coverage_0": 0.3699501812458038,
"rewards/batch_coverage_1": 0.3699501812458038,
"rewards/batch_coverage_10": 0.43272061347961427,
"rewards/batch_coverage_15": 0.4414961338043213,
"rewards/batch_coverage_20": 0.45003448724746703,
"rewards/batch_coverage_25": 0.45256100296974183,
"rewards/batch_coverage_5": 0.40719414353370664,
"rewards/brier_reward": 0.8301080942153931,
"rewards/confidence_uniqueness_reward": 0.9409601330757141,
"rewards/format_reward": 0.994531261920929,
"rewards/frontier_entropy_batch_reward": -0.32251755595207215,
"signal/accuracy_reward/centered_abs_mean": 0.10953233540058135,
"signal/accuracy_reward/group_std_mean": 0.15017634779214858,
"signal/accuracy_reward/group_zero_std_frac": 0.5555555582046509,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8703036189079285,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.05476616770029068,
"signal/advantage_abs_mean": 0.7531775951385498,
"signal/advantage_pre_scale_abs_mean": 0.07817077487707139,
"signal/advantage_pre_scale_std": 0.12902514189481734,
"signal/advantage_std": 0.982884156703949,
"signal/batch_coverage_0/centered_abs_mean": 0.15214686691761017,
"signal/batch_coverage_0/group_std_mean": 0.19155323207378389,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.034885770082473753,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.002175700105726719,
"signal/batch_coverage_1/centered_abs_mean": 0.15214686691761017,
"signal/batch_coverage_1/group_std_mean": 0.19155323207378389,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.034885770082473753,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.002175700105726719,
"signal/batch_coverage_10/centered_abs_mean": 0.17132368981838225,
"signal/batch_coverage_10/group_std_mean": 0.21766560673713684,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.039314419776201245,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.00244992864318192,
"signal/batch_coverage_15/centered_abs_mean": 0.174043932557106,
"signal/batch_coverage_15/group_std_mean": 0.2217872440814972,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03994949609041214,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002488828217610717,
"signal/batch_coverage_20/centered_abs_mean": 0.17794378995895385,
"signal/batch_coverage_20/group_std_mean": 0.22746247053146362,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04086562618613243,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0025445961393415926,
"signal/batch_coverage_25/centered_abs_mean": 0.17909466624259948,
"signal/batch_coverage_25/group_std_mean": 0.22904041409492493,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04110808596014977,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0025610537733882664,
"signal/batch_coverage_5/centered_abs_mean": 0.16287958323955537,
"signal/batch_coverage_5/group_std_mean": 0.20583013594150543,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0373820461332798,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.002329178061336279,
"signal/brier_reward/centered_abs_mean": 0.10916592478752137,
"signal/brier_reward/group_std_mean": 0.1431722342967987,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17486326694488524,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.010916592925786972,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02177223116159439,
"signal/confidence_uniqueness_reward/group_std_mean": 0.029672250896692277,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0346968125551939,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0021772230742499233,
"signal/format_reward/centered_abs_mean": 0.007644314225763083,
"signal/format_reward/group_std_mean": 0.0121083602309227,
"signal/format_reward/group_zero_std_frac": 0.9555555701255798,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.05982875004410744,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0038221571128815414,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32661651968955996,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3946535289287567,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.524556452035904,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03266165219247341,
"step": 200
},
{
"epoch": 0.47999400007499904,
"eval_calibration/aurc": 0.10989090327570772,
"eval_calibration/batch_distribution_entropy": 0.9192275429652724,
"eval_calibration/buffer_distribution_entropy": 0.9643264329465896,
"eval_calibration/confidence_entropy": 0.4607454734710889,
"eval_calibration/coverage@0%": 0.296875,
"eval_calibration/coverage@1%": 0.296875,
"eval_calibration/coverage@10%": 0.6458333333333334,
"eval_calibration/coverage@15%": 0.7447916666666666,
"eval_calibration/coverage@20%": 0.8385416666666666,
"eval_calibration/coverage@25%": 0.9010416666666666,
"eval_calibration/coverage@30%": 0.9739583333333334,
"eval_calibration/coverage@5%": 0.40625,
"eval_calibration/ece": 0.20959003645833332,
"eval_calibration/mean_confidence": 0.5790771197916667,
"eval_completions/clipped_ratio": 0.006076388888888895,
"eval_completions/max_length": 2619.8333333333335,
"eval_completions/max_terminated_length": 2619.8333333333335,
"eval_completions/mean_length": 977.114247639974,
"eval_completions/mean_terminated_length": 983.1600341796875,
"eval_completions/min_length": 153.0,
"eval_completions/min_terminated_length": 395.3333333333333,
"eval_loss": 0.0,
"eval_num_tokens": 490541508.0,
"eval_reward": 0.9246361056963602,
"eval_reward_std": 0.24066531658172607,
"eval_rewards/accuracy_reward": 0.6909722288449606,
"eval_rewards/batch_coverage_0": 0.04255757279073199,
"eval_rewards/batch_coverage_1": 0.04255757279073199,
"eval_rewards/batch_coverage_10": 0.07763015168408553,
"eval_rewards/batch_coverage_15": 0.119985976566871,
"eval_rewards/batch_coverage_20": 0.1694452352821827,
"eval_rewards/batch_coverage_25": 0.2227772449453672,
"eval_rewards/batch_coverage_5": 0.05227204862361153,
"eval_rewards/brier_reward": 0.8235170344511668,
"eval_rewards/confidence_uniqueness_reward": 0.8882950445016226,
"eval_rewards/format_reward": 0.9939236144224802,
"eval_rewards/frontier_entropy_batch_reward": -0.9939236144224802,
"eval_runtime": 183.9893,
"eval_samples_per_second": 5.435,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4173177083333333,
"eval_signal/accuracy_reward/group_std_mean": 0.46319297949473065,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8747199972470602,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.20865885416666666,
"eval_signal/advantage_abs_mean": 0.8748045464356741,
"eval_signal/advantage_pre_scale_abs_mean": 0.21066379050413767,
"eval_signal/advantage_pre_scale_std": 0.2384971926609675,
"eval_signal/advantage_std": 0.9864106674989065,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.1913293475906054,
"eval_signal/batch_coverage_0/group_std_mean": 0.2734084377686183,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011484552950908741,
"eval_signal/batch_coverage_0/weight": 0.014299999922513962,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.0027360095409676433,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.1913293475906054,
"eval_signal/batch_coverage_1/group_std_mean": 0.2734084377686183,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011484552950908741,
"eval_signal/batch_coverage_1/weight": 0.014299999922513962,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.0027360095409676433,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.14493575568000475,
"eval_signal/batch_coverage_10/group_std_mean": 0.19580343117316565,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.008692070453738173,
"eval_signal/batch_coverage_10/weight": 0.014299999922513962,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.002072581206448376,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.15486098205049834,
"eval_signal/batch_coverage_15/group_std_mean": 0.19644736250241598,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.00930007042673727,
"eval_signal/batch_coverage_15/weight": 0.014299999922513962,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.002214512108669927,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.18681678672631583,
"eval_signal/batch_coverage_20/group_std_mean": 0.23144895086685816,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.011187337494144836,
"eval_signal/batch_coverage_20/weight": 0.014299999922513962,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.002671479946002364,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.23808404554923376,
"eval_signal/batch_coverage_25/group_std_mean": 0.28926754742860794,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.014209352588901917,
"eval_signal/batch_coverage_25/weight": 0.014299999922513962,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.0034046017875274024,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.17653791109720865,
"eval_signal/batch_coverage_5/group_std_mean": 0.24955658366282782,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.010630226538827022,
"eval_signal/batch_coverage_5/weight": 0.014299999922513962,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.002524492136823634,
"eval_signal/brier_reward/centered_abs_mean": 0.17701477309068045,
"eval_signal/brier_reward/group_std_mean": 0.23732628921667734,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07436835144956906,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.017701477278023958,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.04846427279214064,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.07100162468850613,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.020224129781126976,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004846427279214065,
"eval_signal/format_reward/centered_abs_mean": 0.011664496424297491,
"eval_signal/format_reward/group_std_mean": 0.031383837262789406,
"eval_signal/format_reward/group_zero_std_frac": 0.8333333532015482,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.02375806588679552,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.0058322482121487456,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.011664496424297491,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.031383837262789406,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.8333333532015482,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.004751613363623619,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0011664496657128136,
"eval_steps_per_second": 0.033,
"step": 200
},
{
"calibration/aurc": 0.13501105600234406,
"calibration/batch_distribution_entropy": 0.9435655969085556,
"calibration/buffer_distribution_entropy": 0.9648407300747447,
"calibration/confidence_entropy": 0.45358109081944986,
"calibration/coverage@0%": 0.020366075348682823,
"calibration/coverage@1%": 0.020366075348682823,
"calibration/coverage@10%": 0.46117268003382444,
"calibration/coverage@15%": 0.5934764138336502,
"calibration/coverage@20%": 0.8597784983913943,
"calibration/coverage@25%": 0.9260988297446133,
"calibration/coverage@30%": 0.9656213979196467,
"calibration/coverage@5%": 0.19665257230616368,
"calibration/ece": 0.14935273101890895,
"calibration/mean_confidence": 0.610298141093447,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3466.0,
"completions/max_terminated_length": 3466.0,
"completions/mean_length": 990.8876098632812,
"completions/mean_terminated_length": 994.7953857421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 312.6,
"epoch": 0.491993850076874,
"grad_norm": 0.002186190104112029,
"learning_rate": 9.036144578313253e-08,
"loss": -0.005,
"num_tokens": 505022485.0,
"reward": 1.0617381572723388,
"reward_std": 0.10663122683763504,
"rewards/accuracy_reward": 0.7565972208976746,
"rewards/batch_coverage_0": 0.370861279964447,
"rewards/batch_coverage_1": 0.370861279964447,
"rewards/batch_coverage_10": 0.4283327877521515,
"rewards/batch_coverage_15": 0.43600705862045286,
"rewards/batch_coverage_20": 0.44471372961997985,
"rewards/batch_coverage_25": 0.44584651589393615,
"rewards/batch_coverage_5": 0.40272694230079653,
"rewards/brier_reward": 0.8254562616348267,
"rewards/confidence_uniqueness_reward": 0.941547405719757,
"rewards/format_reward": 0.9959201335906982,
"rewards/frontier_entropy_batch_reward": -0.32681636810302733,
"signal/accuracy_reward/centered_abs_mean": 0.12428385466337204,
"signal/accuracy_reward/group_std_mean": 0.1633887231349945,
"signal/accuracy_reward/group_zero_std_frac": 0.5333333432674408,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9983546733856201,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06214192733168602,
"signal/advantage_abs_mean": 0.7620988607406616,
"signal/advantage_pre_scale_abs_mean": 0.08042153120040893,
"signal/advantage_pre_scale_std": 0.1311565786600113,
"signal/advantage_std": 0.9828817129135132,
"signal/batch_coverage_0/centered_abs_mean": 0.1498122900724411,
"signal/batch_coverage_0/group_std_mean": 0.1872162103652954,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.034457380324602126,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0021423157304525375,
"signal/batch_coverage_1/centered_abs_mean": 0.1498122900724411,
"signal/batch_coverage_1/group_std_mean": 0.1872162103652954,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.034457380324602126,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0021423157304525375,
"signal/batch_coverage_10/centered_abs_mean": 0.1685813695192337,
"signal/batch_coverage_10/group_std_mean": 0.21188536584377288,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03877582773566246,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0024107135832309723,
"signal/batch_coverage_15/centered_abs_mean": 0.16964525282382964,
"signal/batch_coverage_15/group_std_mean": 0.213523331284523,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.03902594596147537,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0024259272031486034,
"signal/batch_coverage_20/centered_abs_mean": 0.1723014682531357,
"signal/batch_coverage_20/group_std_mean": 0.21752589643001558,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.039649903774261475,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.002463910961523652,
"signal/batch_coverage_25/centered_abs_mean": 0.17293465435504912,
"signal/batch_coverage_25/group_std_mean": 0.2184917449951172,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03979569748044014,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002472965605556965,
"signal/batch_coverage_5/centered_abs_mean": 0.15981419682502745,
"signal/batch_coverage_5/group_std_mean": 0.1999186486005783,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.03676430657505989,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0022853429429233072,
"signal/brier_reward/centered_abs_mean": 0.11391685307025909,
"signal/brier_reward/group_std_mean": 0.14772746860980987,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18313678205013276,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011391685344278812,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.02226530760526657,
"signal/confidence_uniqueness_reward/group_std_mean": 0.03310300558805466,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03581138141453266,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002226530876941979,
"signal/format_reward/centered_abs_mean": 0.006971571210306138,
"signal/format_reward/group_std_mean": 0.014379321224987507,
"signal/format_reward/group_zero_std_frac": 0.9361111283302307,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.056047255359590056,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.003485785605153069,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3415841281414032,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40630670785903933,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5496011257171631,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03415841460227966,
"step": 205
},
{
"calibration/aurc": 0.0754442216982998,
"calibration/batch_distribution_entropy": 0.9318504533202431,
"calibration/buffer_distribution_entropy": 0.9649572636750735,
"calibration/confidence_entropy": 0.46937859993147657,
"calibration/coverage@0%": 0.04861111111111111,
"calibration/coverage@1%": 0.2829861111111111,
"calibration/coverage@10%": 0.7209869995648389,
"calibration/coverage@15%": 0.8435097548592979,
"calibration/coverage@20%": 0.9209094865100087,
"calibration/coverage@25%": 0.9791666666666666,
"calibration/coverage@30%": 1.0,
"calibration/coverage@5%": 0.583623440673049,
"calibration/ece": 0.1433574382728641,
"calibration/mean_confidence": 0.6334517717013889,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0021701388888889137,
"completions/max_length": 3671.0,
"completions/max_terminated_length": 3671.0,
"completions/mean_length": 987.3819783528646,
"completions/mean_terminated_length": 989.5008951822916,
"completions/min_length": 0.0,
"completions/min_terminated_length": 290.6666666666667,
"epoch": 0.49919376007799904,
"num_tokens": 513713317.0,
"reward": 1.0401251713434856,
"reward_std": 0.10764590402444203,
"rewards/accuracy_reward": 0.7122395833333334,
"rewards/batch_coverage_0": 0.3732293943564097,
"rewards/batch_coverage_1": 0.3732293943564097,
"rewards/batch_coverage_10": 0.4244362413883209,
"rewards/batch_coverage_15": 0.4366017282009125,
"rewards/batch_coverage_20": 0.44373422861099243,
"rewards/batch_coverage_25": 0.4510187606016795,
"rewards/batch_coverage_5": 0.39601942896842957,
"rewards/brier_reward": 0.8295948306719462,
"rewards/confidence_uniqueness_reward": 0.9432157675425211,
"rewards/format_reward": 0.9976851940155029,
"rewards/frontier_entropy_batch_reward": -0.33563558260599774,
"signal/accuracy_reward/centered_abs_mean": 0.12830042093992233,
"signal/accuracy_reward/group_std_mean": 0.16989988088607788,
"signal/accuracy_reward/group_zero_std_frac": 0.5138888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9699827035268148,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06415021046996117,
"signal/advantage_abs_mean": 0.7430837154388428,
"signal/advantage_pre_scale_abs_mean": 0.07963799188534419,
"signal/advantage_pre_scale_std": 0.12994669129451117,
"signal/advantage_std": 0.9829309582710266,
"signal/batch_coverage_0/centered_abs_mean": 0.16159847378730774,
"signal/batch_coverage_0/group_std_mean": 0.20468567808469137,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0359560064971447,
"signal/batch_coverage_0/weight": 0.014299999922513962,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00231085818571349,
"signal/batch_coverage_1/centered_abs_mean": 0.16159847378730774,
"signal/batch_coverage_1/group_std_mean": 0.20468567808469137,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0359560064971447,
"signal/batch_coverage_1/weight": 0.014299999922513962,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.00231085818571349,
"signal/batch_coverage_10/centered_abs_mean": 0.17450785636901855,
"signal/batch_coverage_10/group_std_mean": 0.22163488964239755,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.03893564393122991,
"signal/batch_coverage_10/weight": 0.014299999922513962,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.00249546238531669,
"signal/batch_coverage_15/centered_abs_mean": 0.18060380717118582,
"signal/batch_coverage_15/group_std_mean": 0.23038248717784882,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.04025426755348841,
"signal/batch_coverage_15/weight": 0.014299999922513962,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0025826343335211277,
"signal/batch_coverage_20/centered_abs_mean": 0.18379070361455283,
"signal/batch_coverage_20/group_std_mean": 0.23500894010066986,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.04088542362054189,
"signal/batch_coverage_20/weight": 0.014299999922513962,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0026282070515056453,
"signal/batch_coverage_25/centered_abs_mean": 0.18744313220183054,
"signal/batch_coverage_25/group_std_mean": 0.23961820701758066,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.04164646069208781,
"signal/batch_coverage_25/weight": 0.014299999922513962,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.002680436708033085,
"signal/batch_coverage_5/centered_abs_mean": 0.168576513727506,
"signal/batch_coverage_5/group_std_mean": 0.2129517843325933,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0375792533159256,
"signal/batch_coverage_5/weight": 0.014299999922513962,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0024106441996991634,
"signal/brier_reward/centered_abs_mean": 0.11503704637289047,
"signal/brier_reward/group_std_mean": 0.1499533106883367,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1776889761288961,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011503705444435278,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.01900616039832433,
"signal/confidence_uniqueness_reward/group_std_mean": 0.030254953851302464,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02986216110487779,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0019006160631154974,
"signal/format_reward/centered_abs_mean": 0.004466869169846177,
"signal/format_reward/group_std_mean": 0.012596335262060165,
"signal/format_reward/group_zero_std_frac": 0.9305555820465088,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.035761150221029915,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0022334345849230886,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33474533756573993,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40274082620938617,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5223466257254282,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033474533508221306,
"step": 208,
"total_flos": 0.0,
"train_loss": -0.03366587917886276,
"train_runtime": 42091.8671,
"train_samples_per_second": 0.356,
"train_steps_per_second": 0.005
}
],
"logging_steps": 5,
"max_steps": 208,
"num_input_tokens_seen": 513713317,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}