Files
RLCR-v4-ks-highcov-batch-co…/trainer_state.json
ModelHub XC f151254d48 初始化项目,由ModelHub XC社区提供模型
Model: hector-gr/RLCR-v4-ks-highcov-batch-cold-math
Source: Original Platform
2026-05-28 00:37:26 +08:00

6365 lines
400 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.49919376007799904,
"eval_steps": 50,
"global_step": 208,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calibration/aurc": 0.5072314549050652,
"calibration/batch_distribution_entropy": 0.25912438679992567,
"calibration/confidence_entropy": 0.21267152309193102,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.4754453653490055,
"calibration/mean_confidence": 0.9199576586928551,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.019357638888888883,
"completions/max_length": 4044.2,
"completions/max_terminated_length": 4044.2,
"completions/mean_length": 522.4719482421875,
"completions/mean_terminated_length": 532.7948364257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.011999850001874977,
"grad_norm": 0.0038337723817676306,
"learning_rate": 5.952380952380953e-07,
"loss": 0.006,
"num_tokens": 9133085.0,
"reward": 0.4843825876712799,
"reward_std": 0.4449997007846832,
"rewards/accuracy_reward": 0.2596354126930237,
"rewards/batch_coverage_0": 0.006870156154036522,
"rewards/batch_coverage_1": 0.006870156154036522,
"rewards/batch_coverage_10": 0.01887576384469867,
"rewards/batch_coverage_15": 0.034924595057964324,
"rewards/batch_coverage_20": 0.060147954523563384,
"rewards/batch_coverage_25": 0.07294031232595444,
"rewards/batch_coverage_5": 0.00987731795758009,
"rewards/brier_reward": 0.31120150685310366,
"rewards/confidence_uniqueness_reward": 0.2898163080215454,
"rewards/format_reward": 0.5999999880790711,
"rewards/frontier_aurc_reward": 0.27386353611946107,
"rewards/frontier_ece_reward": 0.27386353611946107,
"rewards/frontier_entropy_batch_reward": -0.5739716529846192,
"signal/accuracy_reward/centered_abs_mean": 0.308056640625,
"signal/accuracy_reward/group_std_mean": 0.3664651155471802,
"signal/accuracy_reward/group_zero_std_frac": 0.09722222313284874,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1540283203125,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.1540283203125,
"signal/advantage_abs_mean": 0.3868393898010254,
"signal/advantage_pre_scale_abs_mean": 0.3868393898010254,
"signal/advantage_pre_scale_std": 0.4560303032398224,
"signal/advantage_std": 0.4560303032398224,
"signal/batch_coverage_0/centered_abs_mean": 0.015426980517804622,
"signal/batch_coverage_0/group_std_mean": 0.03147497624158859,
"signal/batch_coverage_0/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0015426980331540107,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0015426980331540107,
"signal/batch_coverage_1/centered_abs_mean": 0.015426980517804622,
"signal/batch_coverage_1/group_std_mean": 0.03147497624158859,
"signal/batch_coverage_1/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0015426980331540107,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0015426980331540107,
"signal/batch_coverage_10/centered_abs_mean": 0.027048196457326413,
"signal/batch_coverage_10/group_std_mean": 0.04408616162836552,
"signal/batch_coverage_10/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.002704819617792964,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002704819617792964,
"signal/batch_coverage_15/centered_abs_mean": 0.045237339287996295,
"signal/batch_coverage_15/group_std_mean": 0.06460195034742355,
"signal/batch_coverage_15/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.004523734096437693,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.004523734096437693,
"signal/batch_coverage_20/centered_abs_mean": 0.081603392213583,
"signal/batch_coverage_20/group_std_mean": 0.10613198131322861,
"signal/batch_coverage_20/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.008160339388996362,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.008160339388996362,
"signal/batch_coverage_25/centered_abs_mean": 0.10508095026016236,
"signal/batch_coverage_25/group_std_mean": 0.13301836997270583,
"signal/batch_coverage_25/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.010508095007389784,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.010508095007389784,
"signal/batch_coverage_5/centered_abs_mean": 0.018093755841255187,
"signal/batch_coverage_5/group_std_mean": 0.03443591110408306,
"signal/batch_coverage_5/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.001809375500306487,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.001809375500306487,
"signal/brier_reward/centered_abs_mean": 0.31866692304611205,
"signal/brier_reward/group_std_mean": 0.3707259178161621,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03186669200658798,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.03186669200658798,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.23531839847564698,
"signal/confidence_uniqueness_reward/group_std_mean": 0.28704230189323426,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.023531839624047278,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.023531839624047278,
"signal/format_reward/centered_abs_mean": 0.43978949785232546,
"signal/format_reward/group_std_mean": 0.47480629086494447,
"signal/format_reward/group_zero_std_frac": 0.0,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.21989474892616273,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.21989474892616273,
"signal/frontier_aurc_reward/centered_abs_mean": 0.30898670554161073,
"signal/frontier_aurc_reward/group_std_mean": 0.3652248322963715,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.0038623338099569083,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.0038623338099569083,
"signal/frontier_ece_reward/centered_abs_mean": 0.30898670554161073,
"signal/frontier_ece_reward/group_std_mean": 0.3652248322963715,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.030898670479655267,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.030898670479655267,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.4491934359073639,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.48202226758003236,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.0449193462729454,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0449193462729454,
"step": 5
},
{
"calibration/aurc": 0.4854229790205542,
"calibration/batch_distribution_entropy": 0.20949222832148623,
"calibration/confidence_entropy": 0.20656387188253925,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.44620888545316256,
"calibration/mean_confidence": 0.9278155657407705,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01649305555555556,
"completions/max_length": 3907.8,
"completions/max_terminated_length": 3907.8,
"completions/mean_length": 477.81553955078124,
"completions/mean_terminated_length": 486.00384521484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.2,
"epoch": 0.023999700003749954,
"grad_norm": 0.004502618685364723,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.0054,
"num_tokens": 17720240.0,
"reward": 0.547285407781601,
"reward_std": 0.4114623963832855,
"rewards/accuracy_reward": 0.2848090291023254,
"rewards/batch_coverage_0": 0.005022120475769043,
"rewards/batch_coverage_1": 0.005022120475769043,
"rewards/batch_coverage_10": 0.018738916981965302,
"rewards/batch_coverage_15": 0.033264560624957085,
"rewards/batch_coverage_20": 0.044974899291992186,
"rewards/batch_coverage_25": 0.05766131952404976,
"rewards/batch_coverage_5": 0.009930487908422946,
"rewards/brier_reward": 0.3467004060745239,
"rewards/confidence_uniqueness_reward": 0.341348922252655,
"rewards/format_reward": 0.704600703716278,
"rewards/frontier_aurc_reward": 0.2998257100582123,
"rewards/frontier_ece_reward": 0.2998257100582123,
"rewards/frontier_entropy_batch_reward": -0.6741620063781738,
"signal/accuracy_reward/centered_abs_mean": 0.3191785991191864,
"signal/accuracy_reward/group_std_mean": 0.37992130517959594,
"signal/accuracy_reward/group_zero_std_frac": 0.07222222462296486,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1595892995595932,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.1595892995595932,
"signal/advantage_abs_mean": 0.34345640540122985,
"signal/advantage_pre_scale_abs_mean": 0.34345640540122985,
"signal/advantage_pre_scale_std": 0.4176114320755005,
"signal/advantage_std": 0.4176114320755005,
"signal/batch_coverage_0/centered_abs_mean": 0.013590708374977112,
"signal/batch_coverage_0/group_std_mean": 0.028382838889956474,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0013590708374977111,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0013590708374977111,
"signal/batch_coverage_1/centered_abs_mean": 0.013590708374977112,
"signal/batch_coverage_1/group_std_mean": 0.028382838889956474,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0013590708374977111,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0013590708374977111,
"signal/batch_coverage_10/centered_abs_mean": 0.019663126021623612,
"signal/batch_coverage_10/group_std_mean": 0.03664802610874176,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0019663126906380056,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0019663126906380056,
"signal/batch_coverage_15/centered_abs_mean": 0.03258531279861927,
"signal/batch_coverage_15/group_std_mean": 0.052305429428815844,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.0032585313078016044,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.0032585313078016044,
"signal/batch_coverage_20/centered_abs_mean": 0.04690381959080696,
"signal/batch_coverage_20/group_std_mean": 0.06873543262481689,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.004690382117405534,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.004690382117405534,
"signal/batch_coverage_25/centered_abs_mean": 0.06806138753890992,
"signal/batch_coverage_25/group_std_mean": 0.09285385459661484,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.00680613899603486,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.00680613899603486,
"signal/batch_coverage_5/centered_abs_mean": 0.014666478522121907,
"signal/batch_coverage_5/group_std_mean": 0.030367295444011688,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0014666478615254163,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0014666478615254163,
"signal/brier_reward/centered_abs_mean": 0.3143719911575317,
"signal/brier_reward/group_std_mean": 0.3694416105747223,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03143719844520092,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.03143719844520092,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.22439135909080504,
"signal/confidence_uniqueness_reward/group_std_mean": 0.2801817238330841,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02243913747370243,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.02243913747370243,
"signal/format_reward/centered_abs_mean": 0.36346028447151185,
"signal/format_reward/group_std_mean": 0.4252608478069305,
"signal/format_reward/group_zero_std_frac": 0.002777777798473835,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.18173014223575593,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.18173014223575593,
"signal/frontier_aurc_reward/centered_abs_mean": 0.3132124185562134,
"signal/frontier_aurc_reward/group_std_mean": 0.3720038115978241,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.003915155259892345,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.003915155259892345,
"signal/frontier_ece_reward/centered_abs_mean": 0.3132124185562134,
"signal/frontier_ece_reward/group_std_mean": 0.3720038115978241,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.03132124207913876,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.03132124207913876,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.38731067180633544,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.444005960226059,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.038731067627668384,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.038731067627668384,
"step": 10
},
{
"calibration/aurc": 0.5478506539210682,
"calibration/batch_distribution_entropy": 0.2786014424283696,
"calibration/confidence_entropy": 0.22692472615850523,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.0,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.5215523162541775,
"calibration/mean_confidence": 0.9158861678111789,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01293402777777779,
"completions/max_length": 3933.8,
"completions/max_terminated_length": 3933.8,
"completions/mean_length": 427.5373291015625,
"completions/mean_terminated_length": 433.16363525390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.4,
"epoch": 0.03599955000562493,
"grad_norm": 0.0019597464706748724,
"learning_rate": 1.7857142857142859e-06,
"loss": -0.009,
"num_tokens": 25747454.0,
"reward": 0.6696977734565734,
"reward_std": 0.3273072600364685,
"rewards/accuracy_reward": 0.30598958134651183,
"rewards/batch_coverage_0": 0.012201336584985257,
"rewards/batch_coverage_1": 0.012201336584985257,
"rewards/batch_coverage_10": 0.028961936943233013,
"rewards/batch_coverage_15": 0.0324846439063549,
"rewards/batch_coverage_20": 0.03875870779156685,
"rewards/batch_coverage_25": 0.049946078658103944,
"rewards/batch_coverage_5": 0.01885376647114754,
"rewards/brier_reward": 0.40501958727836607,
"rewards/confidence_uniqueness_reward": 0.48818616271018983,
"rewards/format_reward": 0.9152777671813965,
"rewards/frontier_aurc_reward": 0.3329282164573669,
"rewards/frontier_ece_reward": 0.3329282164573669,
"rewards/frontier_entropy_batch_reward": -0.8705167293548584,
"signal/accuracy_reward/centered_abs_mean": 0.31670464277267457,
"signal/accuracy_reward/group_std_mean": 0.3770868182182312,
"signal/accuracy_reward/group_zero_std_frac": 0.07777777910232545,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.15835232138633729,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.15835232138633729,
"signal/advantage_abs_mean": 0.2627263814210892,
"signal/advantage_pre_scale_abs_mean": 0.2627263814210892,
"signal/advantage_pre_scale_std": 0.3356821537017822,
"signal/advantage_std": 0.3356821537017822,
"signal/batch_coverage_0/centered_abs_mean": 0.019925065338611603,
"signal/batch_coverage_0/group_std_mean": 0.03897041603922844,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.001992506510578096,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.001992506510578096,
"signal/batch_coverage_1/centered_abs_mean": 0.019925065338611603,
"signal/batch_coverage_1/group_std_mean": 0.03897041603922844,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.001992506510578096,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.001992506510578096,
"signal/batch_coverage_10/centered_abs_mean": 0.026121413335204124,
"signal/batch_coverage_10/group_std_mean": 0.04870801717042923,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.002612141496501863,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.002612141496501863,
"signal/batch_coverage_15/centered_abs_mean": 0.028343011811375617,
"signal/batch_coverage_15/group_std_mean": 0.0515684649348259,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.002834301325492561,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.002834301325492561,
"signal/batch_coverage_20/centered_abs_mean": 0.0341463316231966,
"signal/batch_coverage_20/group_std_mean": 0.05852316170930862,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.0034146332647651432,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.0034146332647651432,
"signal/batch_coverage_25/centered_abs_mean": 0.04928958341479302,
"signal/batch_coverage_25/group_std_mean": 0.07621403560042381,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.004928958602249622,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.004928958602249622,
"signal/batch_coverage_5/centered_abs_mean": 0.021870536357164384,
"signal/batch_coverage_5/group_std_mean": 0.04211917594075203,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0021870536962524056,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0021870536962524056,
"signal/brier_reward/centered_abs_mean": 0.3000001132488251,
"signal/brier_reward/group_std_mean": 0.35329429507255555,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03000001087784767,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.03000001087784767,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.18271521627902984,
"signal/confidence_uniqueness_reward/group_std_mean": 0.23359984457492827,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.018271520733833313,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.018271520733833313,
"signal/format_reward/centered_abs_mean": 0.13900824636220932,
"signal/format_reward/group_std_mean": 0.22593195140361785,
"signal/format_reward/group_zero_std_frac": 0.20833333656191827,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.06950412318110466,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.06950412318110466,
"signal/frontier_aurc_reward/centered_abs_mean": 0.3091962695121765,
"signal/frontier_aurc_reward/group_std_mean": 0.36607717275619506,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.0038649535737931727,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.0038649535737931727,
"signal/frontier_ece_reward/centered_abs_mean": 0.3091962695121765,
"signal/frontier_ece_reward/group_std_mean": 0.36607717275619506,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.03091962859034538,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.03091962859034538,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.20600322484970093,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3077341616153717,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.06666666939854622,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.02060032319277525,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.02060032319277525,
"step": 15
},
{
"calibration/aurc": 0.4821319044702136,
"calibration/batch_distribution_entropy": 0.3488917045405391,
"calibration/buffer_distribution_entropy": 0.2759978235953544,
"calibration/confidence_entropy": 0.2768365393170709,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.0,
"calibration/coverage@25%": 0.0,
"calibration/coverage@30%": 0.042105263157894736,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.42374968063460816,
"calibration/mean_confidence": 0.897871143815777,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010329861111111116,
"completions/max_length": 4012.0,
"completions/max_terminated_length": 4012.0,
"completions/mean_length": 444.3298645019531,
"completions/mean_terminated_length": 448.9945007324219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.6,
"epoch": 0.04799940000749991,
"grad_norm": 0.001022929442115128,
"learning_rate": 2.380952380952381e-06,
"loss": -0.0083,
"num_tokens": 33979830.0,
"reward": 0.7796724319458008,
"reward_std": 0.287617164850235,
"rewards/accuracy_reward": 0.4071180522441864,
"rewards/batch_coverage_0": 0.021281986311078072,
"rewards/batch_coverage_1": 0.021281986311078072,
"rewards/batch_coverage_10": 0.07034937888383866,
"rewards/batch_coverage_15": 0.07969716563820839,
"rewards/batch_coverage_20": 0.10704035162925721,
"rewards/batch_coverage_25": 0.1295667678117752,
"rewards/batch_coverage_5": 0.03042309693992138,
"rewards/brier_reward": 0.5185995876789093,
"rewards/confidence_uniqueness_reward": 0.5843602895736695,
"rewards/format_reward": 0.98203125,
"rewards/frontier_aurc_reward": 0.2070214475505054,
"rewards/frontier_ece_reward": 0.19309240709990264,
"rewards/frontier_entropy_batch_reward": -0.9305931091308594,
"signal/accuracy_reward/centered_abs_mean": 0.30668402314186094,
"signal/accuracy_reward/group_std_mean": 0.37011672258377076,
"signal/accuracy_reward/group_zero_std_frac": 0.08611111342906952,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.15334201157093047,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.15334201157093047,
"signal/advantage_abs_mean": 0.22955691814422607,
"signal/advantage_pre_scale_abs_mean": 0.22955691814422607,
"signal/advantage_pre_scale_std": 0.2956406891345978,
"signal/advantage_std": 0.2956406891345978,
"signal/batch_coverage_0/centered_abs_mean": 0.027153197303414346,
"signal/batch_coverage_0/group_std_mean": 0.0489169105887413,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0027153198141604664,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0027153198141604664,
"signal/batch_coverage_1/centered_abs_mean": 0.027153197303414346,
"signal/batch_coverage_1/group_std_mean": 0.0489169105887413,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0027153198141604664,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0027153198141604664,
"signal/batch_coverage_10/centered_abs_mean": 0.044522304087877274,
"signal/batch_coverage_10/group_std_mean": 0.07788674235343933,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.004452230548486114,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.004452230548486114,
"signal/batch_coverage_15/centered_abs_mean": 0.04999452456831932,
"signal/batch_coverage_15/group_std_mean": 0.08566224724054336,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.004999452503398061,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.004999452503398061,
"signal/batch_coverage_20/centered_abs_mean": 0.07234487235546112,
"signal/batch_coverage_20/group_std_mean": 0.11505262106657028,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.007234487310051918,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.007234487310051918,
"signal/batch_coverage_25/centered_abs_mean": 0.10189752876758576,
"signal/batch_coverage_25/group_std_mean": 0.15013812482357025,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.010189752839505673,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.010189752839505673,
"signal/batch_coverage_5/centered_abs_mean": 0.02882133685052395,
"signal/batch_coverage_5/group_std_mean": 0.05224255993962288,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.0028821338899433615,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.0028821338899433615,
"signal/brier_reward/centered_abs_mean": 0.276527601480484,
"signal/brier_reward/group_std_mean": 0.3314927339553833,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.027652759104967117,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.027652759104967117,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.15654111206531524,
"signal/confidence_uniqueness_reward/group_std_mean": 0.19497571289539337,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.015654110908508302,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.015654110908508302,
"signal/format_reward/centered_abs_mean": 0.03294813297688961,
"signal/format_reward/group_std_mean": 0.07249769493937493,
"signal/format_reward/group_zero_std_frac": 0.6666666746139527,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.016474066488444804,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.016474066488444804,
"signal/frontier_aurc_reward/centered_abs_mean": 0.150981783028692,
"signal/frontier_aurc_reward/group_std_mean": 0.18202955992892383,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 0.0018872723849199247,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 0.0018872723849199247,
"signal/frontier_ece_reward/centered_abs_mean": 0.2331768661737442,
"signal/frontier_ece_reward/group_std_mean": 0.2786339819431305,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.023317687213420868,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.023317687213420868,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.12025448828935623,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.22101308405399323,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.18888889104127884,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.012025448866188527,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.012025448866188527,
"step": 20
},
{
"calibration/aurc": 0.3805529542630866,
"calibration/batch_distribution_entropy": 0.5245517949798162,
"calibration/buffer_distribution_entropy": 0.32109023757926547,
"calibration/confidence_entropy": 0.3423178730715135,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.02198952879581152,
"calibration/coverage@20%": 0.10837696335078535,
"calibration/coverage@25%": 0.13298429319371727,
"calibration/coverage@30%": 0.21904761904761907,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.2970157676613897,
"calibration/mean_confidence": 0.8597930621346853,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010937499999999978,
"completions/max_length": 3883.6,
"completions/max_terminated_length": 3883.6,
"completions/mean_length": 481.8978332519531,
"completions/mean_terminated_length": 487.2155456542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.8,
"epoch": 0.05999925000937488,
"grad_norm": 0.0010147301945835352,
"learning_rate": 2.9761904761904763e-06,
"loss": -0.0063,
"num_tokens": 42655741.0,
"reward": 0.8809885144233703,
"reward_std": 0.2663578689098358,
"rewards/accuracy_reward": 0.5109375,
"rewards/batch_coverage_0": 0.05274242460727692,
"rewards/batch_coverage_1": 0.05274242460727692,
"rewards/batch_coverage_10": 0.156163290143013,
"rewards/batch_coverage_15": 0.17960385233163834,
"rewards/batch_coverage_20": 0.21484352350234986,
"rewards/batch_coverage_25": 0.2300809234380722,
"rewards/batch_coverage_5": 0.07413579896092415,
"rewards/brier_reward": 0.6292544364929199,
"rewards/confidence_uniqueness_reward": 0.6723939895629882,
"rewards/format_reward": 0.9863715410232544,
"rewards/frontier_aurc_reward": -0.004603662062436342,
"rewards/frontier_ece_reward": -0.0024937400594353676,
"rewards/frontier_entropy_batch_reward": -0.9355515599250793,
"signal/accuracy_reward/centered_abs_mean": 0.27855902910232544,
"signal/accuracy_reward/group_std_mean": 0.3460928499698639,
"signal/accuracy_reward/group_zero_std_frac": 0.1111111119389534,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.13927951455116272,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.13927951455116272,
"signal/advantage_abs_mean": 0.2031567931175232,
"signal/advantage_pre_scale_abs_mean": 0.2031567931175232,
"signal/advantage_pre_scale_std": 0.2749268710613251,
"signal/advantage_std": 0.2749268710613251,
"signal/batch_coverage_0/centered_abs_mean": 0.04045618698000908,
"signal/batch_coverage_0/group_std_mean": 0.06610891073942185,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.004045618698000908,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.004045618698000908,
"signal/batch_coverage_1/centered_abs_mean": 0.04045618698000908,
"signal/batch_coverage_1/group_std_mean": 0.06610891073942185,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.004045618698000908,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.004045618698000908,
"signal/batch_coverage_10/centered_abs_mean": 0.07423198148608208,
"signal/batch_coverage_10/group_std_mean": 0.12178197354078293,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.007423198502510786,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.007423198502510786,
"signal/batch_coverage_15/centered_abs_mean": 0.08689813762903213,
"signal/batch_coverage_15/group_std_mean": 0.14011010825634002,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.008689813409000635,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.008689813409000635,
"signal/batch_coverage_20/centered_abs_mean": 0.11602874398231507,
"signal/batch_coverage_20/group_std_mean": 0.1792426437139511,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.011602874659001828,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.011602874659001828,
"signal/batch_coverage_25/centered_abs_mean": 0.13573874533176422,
"signal/batch_coverage_25/group_std_mean": 0.20239817202091218,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.013573875278234481,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.013573875278234481,
"signal/batch_coverage_5/centered_abs_mean": 0.04396994709968567,
"signal/batch_coverage_5/group_std_mean": 0.07224964424967766,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.004396994784474373,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.004396994784474373,
"signal/brier_reward/centered_abs_mean": 0.2282465249300003,
"signal/brier_reward/group_std_mean": 0.282819801568985,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.022824652120471002,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.022824652120471002,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1486896753311157,
"signal/confidence_uniqueness_reward/group_std_mean": 0.178310364484787,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.014868967980146409,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.014868967980146409,
"signal/format_reward/centered_abs_mean": 0.02458224818110466,
"signal/format_reward/group_std_mean": 0.050415501743555066,
"signal/format_reward/group_zero_std_frac": 0.7805555701255799,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01229112409055233,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01229112409055233,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0032616199925541876,
"signal/frontier_aurc_reward/group_std_mean": 0.004778617806732654,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.0770251507638024e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.0770251507638024e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.1397606164216995,
"signal/frontier_ece_reward/group_std_mean": 0.16592053472995758,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.013976060971617698,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.013976060971617698,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.1111811563372612,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.2114289492368698,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.26111111640930174,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.011118116043508052,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.011118116043508052,
"step": 25
},
{
"calibration/aurc": 0.3004850488737162,
"calibration/batch_distribution_entropy": 0.6425691893665822,
"calibration/buffer_distribution_entropy": 0.39893373230297585,
"calibration/confidence_entropy": 0.3918151368055475,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.07807486631016043,
"calibration/coverage@25%": 0.23711550510717042,
"calibration/coverage@30%": 0.5197976414978907,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.17719548548718614,
"calibration/mean_confidence": 0.8239461402890431,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015104166666666674,
"completions/max_length": 3855.4,
"completions/max_terminated_length": 3855.4,
"completions/mean_length": 558.49384765625,
"completions/mean_terminated_length": 567.1655395507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.8,
"epoch": 0.07199910001124986,
"grad_norm": 0.0006667505949735641,
"learning_rate": 3.5714285714285718e-06,
"loss": -0.0094,
"num_tokens": 52199510.0,
"reward": 0.9673492670059204,
"reward_std": 0.25074209868907926,
"rewards/accuracy_reward": 0.5760416626930237,
"rewards/batch_coverage_0": 0.1101593405008316,
"rewards/batch_coverage_1": 0.1101593405008316,
"rewards/batch_coverage_10": 0.2120990425348282,
"rewards/batch_coverage_15": 0.23272857069969177,
"rewards/batch_coverage_20": 0.2680616557598114,
"rewards/batch_coverage_25": 0.2810434937477112,
"rewards/batch_coverage_5": 0.1666581004858017,
"rewards/brier_reward": 0.6929940223693848,
"rewards/confidence_uniqueness_reward": 0.7037853598594666,
"rewards/format_reward": 0.9827257037162781,
"rewards/frontier_aurc_reward": -0.0037181817460805178,
"rewards/frontier_ece_reward": 0.01464775917120278,
"rewards/frontier_entropy_batch_reward": -0.9122162461280823,
"signal/accuracy_reward/centered_abs_mean": 0.24763454794883727,
"signal/accuracy_reward/group_std_mean": 0.3104135751724243,
"signal/accuracy_reward/group_zero_std_frac": 0.1777777835726738,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.12381727397441863,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.12381727397441863,
"signal/advantage_abs_mean": 0.18919424712657928,
"signal/advantage_pre_scale_abs_mean": 0.18919424712657928,
"signal/advantage_pre_scale_std": 0.2605606704950333,
"signal/advantage_std": 0.2605606704950333,
"signal/batch_coverage_0/centered_abs_mean": 0.06281921863555909,
"signal/batch_coverage_0/group_std_mean": 0.09205281436443329,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0062819220125675205,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.0062819220125675205,
"signal/batch_coverage_1/centered_abs_mean": 0.06281921863555909,
"signal/batch_coverage_1/group_std_mean": 0.09205281436443329,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.0062819220125675205,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.0062819220125675205,
"signal/batch_coverage_10/centered_abs_mean": 0.0929559051990509,
"signal/batch_coverage_10/group_std_mean": 0.13886004090309143,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.009295590780675411,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.009295590780675411,
"signal/batch_coverage_15/centered_abs_mean": 0.10421416014432908,
"signal/batch_coverage_15/group_std_mean": 0.15459922552108765,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.010421415977180003,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.010421415977180003,
"signal/batch_coverage_20/centered_abs_mean": 0.13094892650842666,
"signal/batch_coverage_20/group_std_mean": 0.19057590663433074,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.013094892725348473,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.013094892725348473,
"signal/batch_coverage_25/centered_abs_mean": 0.1444714769721031,
"signal/batch_coverage_25/group_std_mean": 0.20690469741821288,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.014447147585451603,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.014447147585451603,
"signal/batch_coverage_5/centered_abs_mean": 0.0769407331943512,
"signal/batch_coverage_5/group_std_mean": 0.11383293271064758,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.007694073114544153,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.007694073114544153,
"signal/brier_reward/centered_abs_mean": 0.19581978023052216,
"signal/brier_reward/group_std_mean": 0.2459787905216217,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.019581978768110277,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.019581978768110277,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.15202451646327972,
"signal/confidence_uniqueness_reward/group_std_mean": 0.18180184066295624,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.01520245186984539,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.01520245186984539,
"signal/format_reward/centered_abs_mean": 0.02896592915058136,
"signal/format_reward/group_std_mean": 0.05586763843894005,
"signal/format_reward/group_zero_std_frac": 0.7666666626930236,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01448296457529068,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01448296457529068,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0027071074582636356,
"signal/frontier_aurc_reward/group_std_mean": 0.003949257265776396,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.3838843592093326e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.3838843592093326e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.09545410424470901,
"signal/frontier_ece_reward/group_std_mean": 0.1201583355665207,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.009545410610735416,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.009545410610735416,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.15066551864147187,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.27135526239871977,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.13888889029622078,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.015066551603376865,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.015066551603376865,
"step": 30
},
{
"calibration/aurc": 0.23015040121185343,
"calibration/batch_distribution_entropy": 0.5256467913204419,
"calibration/buffer_distribution_entropy": 0.44834076164911146,
"calibration/confidence_entropy": 0.26321980631678177,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.14310274655102243,
"calibration/coverage@20%": 0.4793298240583419,
"calibration/coverage@25%": 0.7428456379652152,
"calibration/coverage@30%": 0.8,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.18346354831539727,
"calibration/mean_confidence": 0.8748998868807243,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.020486111111111094,
"completions/max_length": 3840.6,
"completions/max_terminated_length": 3840.6,
"completions/mean_length": 615.801318359375,
"completions/mean_terminated_length": 628.7870483398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.4,
"epoch": 0.08399895001312484,
"grad_norm": 0.0009780606487765908,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0154,
"num_tokens": 62370981.0,
"reward": 1.074914562702179,
"reward_std": 0.24743359982967378,
"rewards/accuracy_reward": 0.6279513835906982,
"rewards/batch_coverage_0": 0.1995459720492363,
"rewards/batch_coverage_1": 0.1995459720492363,
"rewards/batch_coverage_10": 0.3318263113498688,
"rewards/batch_coverage_15": 0.35595046579837797,
"rewards/batch_coverage_20": 0.37790881991386416,
"rewards/batch_coverage_25": 0.3956703066825867,
"rewards/batch_coverage_5": 0.2661326140165329,
"rewards/brier_reward": 0.7158191680908204,
"rewards/confidence_uniqueness_reward": 0.6305435240268707,
"rewards/format_reward": 0.978124988079071,
"rewards/frontier_aurc_reward": -0.003513122210279107,
"rewards/frontier_ece_reward": 0.036505474848672746,
"rewards/frontier_entropy_batch_reward": -0.7902460932731629,
"signal/accuracy_reward/centered_abs_mean": 0.21014539897441864,
"signal/accuracy_reward/group_std_mean": 0.26572045087814333,
"signal/accuracy_reward/group_zero_std_frac": 0.2888888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.10507269948720932,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.10507269948720932,
"signal/advantage_abs_mean": 0.18241074681282043,
"signal/advantage_pre_scale_abs_mean": 0.18241074681282043,
"signal/advantage_pre_scale_std": 0.26696314215660094,
"signal/advantage_std": 0.26696314215660094,
"signal/batch_coverage_0/centered_abs_mean": 0.06104315221309662,
"signal/batch_coverage_0/group_std_mean": 0.09295270442962647,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.00610431581735611,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.00610431581735611,
"signal/batch_coverage_1/centered_abs_mean": 0.06104315221309662,
"signal/batch_coverage_1/group_std_mean": 0.09295270442962647,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.00610431581735611,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.00610431581735611,
"signal/batch_coverage_10/centered_abs_mean": 0.100413478910923,
"signal/batch_coverage_10/group_std_mean": 0.15636947602033616,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0100413478910923,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0100413478910923,
"signal/batch_coverage_15/centered_abs_mean": 0.11477141976356506,
"signal/batch_coverage_15/group_std_mean": 0.1760800987482071,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01147714201360941,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01147714201360941,
"signal/batch_coverage_20/centered_abs_mean": 0.13171382546424865,
"signal/batch_coverage_20/group_std_mean": 0.19760640561580659,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.013171382062137128,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.013171382062137128,
"signal/batch_coverage_25/centered_abs_mean": 0.1536375403404236,
"signal/batch_coverage_25/group_std_mean": 0.22362993359565736,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015363754145801068,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015363754145801068,
"signal/batch_coverage_5/centered_abs_mean": 0.07415727078914643,
"signal/batch_coverage_5/group_std_mean": 0.1162964329123497,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.007415727060288191,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.007415727060288191,
"signal/brier_reward/centered_abs_mean": 0.19005568623542785,
"signal/brier_reward/group_std_mean": 0.2425243467092514,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01900556981563568,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01900556981563568,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.23612670004367828,
"signal/confidence_uniqueness_reward/group_std_mean": 0.27312268912792204,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.023612670600414276,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.023612670600414276,
"signal/format_reward/centered_abs_mean": 0.03466796837747097,
"signal/format_reward/group_std_mean": 0.060341247171163556,
"signal/format_reward/group_zero_std_frac": 0.7666666746139527,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.017333984188735486,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.017333984188735486,
"signal/frontier_aurc_reward/centered_abs_mean": 0.004159660637378692,
"signal/frontier_aurc_reward/group_std_mean": 0.005809722188860178,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 5.1995759713463484e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 5.1995759713463484e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.09585424959659576,
"signal/frontier_ece_reward/group_std_mean": 0.12318123281002044,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.009585425443947315,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.009585425443947315,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.2937092065811157,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4011690020561218,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.027777778543531896,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.02937092147767544,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.02937092147767544,
"step": 35
},
{
"calibration/aurc": 0.24487568965136558,
"calibration/batch_distribution_entropy": 0.6500097565212745,
"calibration/buffer_distribution_entropy": 0.4660041553477049,
"calibration/confidence_entropy": 0.2837820801954671,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.0,
"calibration/coverage@15%": 0.0,
"calibration/coverage@20%": 0.4201621979024496,
"calibration/coverage@25%": 0.6752763614382925,
"calibration/coverage@30%": 0.849405582808501,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.17441238108133758,
"calibration/mean_confidence": 0.8161932812282213,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017100694444444443,
"completions/max_length": 3569.8,
"completions/max_terminated_length": 3569.8,
"completions/mean_length": 670.5376953125,
"completions/mean_terminated_length": 682.1771606445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.8,
"epoch": 0.09599880001499982,
"grad_norm": 0.0007898859912529588,
"learning_rate": 4.761904761904762e-06,
"loss": -0.017,
"num_tokens": 73215095.0,
"reward": 1.1504925727844237,
"reward_std": 0.256816166639328,
"rewards/accuracy_reward": 0.6532986164093018,
"rewards/batch_coverage_0": 0.22187545001506806,
"rewards/batch_coverage_1": 0.22187545001506806,
"rewards/batch_coverage_10": 0.3847365379333496,
"rewards/batch_coverage_15": 0.41597190499305725,
"rewards/batch_coverage_20": 0.437878692150116,
"rewards/batch_coverage_25": 0.45056607723236086,
"rewards/batch_coverage_5": 0.31669565439224245,
"rewards/brier_reward": 0.7430066347122193,
"rewards/confidence_uniqueness_reward": 0.6594228565692901,
"rewards/format_reward": 0.9816840291023254,
"rewards/frontier_aurc_reward": -0.0030309411231428385,
"rewards/frontier_ece_reward": 0.03790239319205284,
"rewards/frontier_entropy_batch_reward": -0.5595400869846344,
"signal/accuracy_reward/centered_abs_mean": 0.18483072519302368,
"signal/accuracy_reward/group_std_mean": 0.24741473495960237,
"signal/accuracy_reward/group_zero_std_frac": 0.28888889253139494,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09241536259651184,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09241536259651184,
"signal/advantage_abs_mean": 0.18649652004241943,
"signal/advantage_pre_scale_abs_mean": 0.18649652004241943,
"signal/advantage_pre_scale_std": 0.27345457673072815,
"signal/advantage_std": 0.27345457673072815,
"signal/batch_coverage_0/centered_abs_mean": 0.11488340944051742,
"signal/batch_coverage_0/group_std_mean": 0.16767307221889496,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011488340701907874,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.011488340701907874,
"signal/batch_coverage_1/centered_abs_mean": 0.11488340944051742,
"signal/batch_coverage_1/group_std_mean": 0.16767307221889496,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011488340701907874,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.011488340701907874,
"signal/batch_coverage_10/centered_abs_mean": 0.16001660525798797,
"signal/batch_coverage_10/group_std_mean": 0.23101125061511993,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.016001661494374275,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.016001661494374275,
"signal/batch_coverage_15/centered_abs_mean": 0.17803999781608582,
"signal/batch_coverage_15/group_std_mean": 0.254090678691864,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01780400052666664,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01780400052666664,
"signal/batch_coverage_20/centered_abs_mean": 0.1952661693096161,
"signal/batch_coverage_20/group_std_mean": 0.2745431065559387,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.019526617601513863,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.019526617601513863,
"signal/batch_coverage_25/centered_abs_mean": 0.2111252725124359,
"signal/batch_coverage_25/group_std_mean": 0.29278976321220396,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.0211125273257494,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.0211125273257494,
"signal/batch_coverage_5/centered_abs_mean": 0.13654999434947968,
"signal/batch_coverage_5/group_std_mean": 0.19951672554016114,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.013654999434947968,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.013654999434947968,
"signal/brier_reward/centered_abs_mean": 0.19350166916847228,
"signal/brier_reward/group_std_mean": 0.2490565538406372,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.019350168108940125,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.019350168108940125,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.24396952986717224,
"signal/confidence_uniqueness_reward/group_std_mean": 0.2847494125366211,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02439695280045271,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.02439695280045271,
"signal/format_reward/centered_abs_mean": 0.0307020403444767,
"signal/format_reward/group_std_mean": 0.05410040691494942,
"signal/format_reward/group_zero_std_frac": 0.7888888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01535102017223835,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01535102017223835,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0033766867127269506,
"signal/frontier_aurc_reward/group_std_mean": 0.004707662016153335,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 4.2208583545289e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 4.2208583545289e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.07757208049297333,
"signal/frontier_ece_reward/group_std_mean": 0.10421034693717957,
"signal/frontier_ece_reward/group_zero_std_frac": 0.002777777798473835,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0077572080306708814,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0077572080306708814,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3927412450313568,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.46981320977211,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.01111111119389534,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.03927412405610085,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03927412405610085,
"step": 40
},
{
"calibration/aurc": 0.16850152989275416,
"calibration/batch_distribution_entropy": 0.7358482569441548,
"calibration/buffer_distribution_entropy": 0.5184809949070356,
"calibration/confidence_entropy": 0.31368130358296725,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.09847723025533593,
"calibration/coverage@15%": 0.4264761977408342,
"calibration/coverage@20%": 0.7963623568945273,
"calibration/coverage@25%": 0.9617862772303235,
"calibration/coverage@30%": 1.0,
"calibration/coverage@5%": 0.021465968586387434,
"calibration/ece": 0.13108366387334294,
"calibration/mean_confidence": 0.7715716669142532,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015538194444444441,
"completions/max_length": 3910.0,
"completions/max_terminated_length": 3910.0,
"completions/mean_length": 737.1761474609375,
"completions/mean_terminated_length": 748.7971313476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 203.4,
"epoch": 0.1079986500168748,
"grad_norm": 0.000921767670661211,
"learning_rate": 4.909638554216868e-06,
"loss": -0.0162,
"num_tokens": 84842628.0,
"reward": 1.1769782543182372,
"reward_std": 0.2623761177062988,
"rewards/accuracy_reward": 0.6508680582046509,
"rewards/batch_coverage_0": 0.23462156355381011,
"rewards/batch_coverage_1": 0.23462156355381011,
"rewards/batch_coverage_10": 0.37513543367385865,
"rewards/batch_coverage_15": 0.40046226382255556,
"rewards/batch_coverage_20": 0.42590277194976806,
"rewards/batch_coverage_25": 0.4409227430820465,
"rewards/batch_coverage_5": 0.3107755959033966,
"rewards/brier_reward": 0.7588403224945068,
"rewards/confidence_uniqueness_reward": 0.8319248676300048,
"rewards/format_reward": 0.9840277671813965,
"rewards/frontier_aurc_reward": -0.0025125455809757113,
"rewards/frontier_ece_reward": 0.026418356224894524,
"rewards/frontier_entropy_batch_reward": -0.4440083742141724,
"signal/accuracy_reward/centered_abs_mean": 0.19178602397441863,
"signal/accuracy_reward/group_std_mean": 0.2533793181180954,
"signal/accuracy_reward/group_zero_std_frac": 0.2833333402872086,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09589301198720931,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09589301198720931,
"signal/advantage_abs_mean": 0.19616546034812926,
"signal/advantage_pre_scale_abs_mean": 0.19616546034812926,
"signal/advantage_pre_scale_std": 0.27432178854942324,
"signal/advantage_std": 0.27432178854942324,
"signal/batch_coverage_0/centered_abs_mean": 0.1771412819623947,
"signal/batch_coverage_0/group_std_mean": 0.24457992613315582,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.017714128270745276,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017714128270745276,
"signal/batch_coverage_1/centered_abs_mean": 0.1771412819623947,
"signal/batch_coverage_1/group_std_mean": 0.24457992613315582,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.017714128270745276,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.017714128270745276,
"signal/batch_coverage_10/centered_abs_mean": 0.21691608130931855,
"signal/batch_coverage_10/group_std_mean": 0.29502947330474855,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.021691609546542166,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.021691609546542166,
"signal/batch_coverage_15/centered_abs_mean": 0.22483428716659545,
"signal/batch_coverage_15/group_std_mean": 0.30313605070114136,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.022483428567647935,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.022483428567647935,
"signal/batch_coverage_20/centered_abs_mean": 0.2426200658082962,
"signal/batch_coverage_20/group_std_mean": 0.3239041268825531,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02426200732588768,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.02426200732588768,
"signal/batch_coverage_25/centered_abs_mean": 0.25766364932060243,
"signal/batch_coverage_25/group_std_mean": 0.3407094895839691,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.025766366347670554,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.025766366347670554,
"signal/batch_coverage_5/centered_abs_mean": 0.19479849338531494,
"signal/batch_coverage_5/group_std_mean": 0.2667528986930847,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01947984956204891,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01947984956204891,
"signal/brier_reward/centered_abs_mean": 0.19988948702812195,
"signal/brier_reward/group_std_mean": 0.25549029409885404,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.019988948851823805,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.019988948851823805,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1287672832608223,
"signal/confidence_uniqueness_reward/group_std_mean": 0.16340579688549042,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.012876728549599648,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.012876728549599648,
"signal/format_reward/centered_abs_mean": 0.02685546912252903,
"signal/format_reward/group_std_mean": 0.052517061680555345,
"signal/format_reward/group_zero_std_frac": 0.775,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.013427734561264514,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.013427734561264514,
"signal/frontier_aurc_reward/centered_abs_mean": 0.002215990168042481,
"signal/frontier_aurc_reward/group_std_mean": 0.0032423562835901974,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.76998780464055e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.76998780464055e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.05805379226803779,
"signal/frontier_ece_reward/group_std_mean": 0.07890773713588714,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.005805379338562488,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.005805379338562488,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.40570557713508604,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.48176873922348024,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.002777777798473835,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.04057055860757828,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.04057055860757828,
"step": 45
},
{
"calibration/aurc": 0.3657922181935306,
"calibration/batch_distribution_entropy": 0.7370499011455982,
"calibration/buffer_distribution_entropy": 0.5546174449056582,
"calibration/confidence_entropy": 0.2774853426801891,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.01078167115902965,
"calibration/coverage@15%": 0.011320754716981133,
"calibration/coverage@20%": 0.06305855243974426,
"calibration/coverage@25%": 0.23091601402664813,
"calibration/coverage@30%": 0.4311086853697116,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.25106672354368215,
"calibration/mean_confidence": 0.6794906292963364,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013194444444444463,
"completions/max_length": 3656.0,
"completions/max_terminated_length": 3656.0,
"completions/mean_length": 794.4783081054687,
"completions/mean_terminated_length": 805.1396240234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.11999850001874976,
"grad_norm": 0.0008140106801874936,
"learning_rate": 4.759036144578314e-06,
"loss": -0.0145,
"num_tokens": 97092618.0,
"reward": 1.1897046327590943,
"reward_std": 0.268722853064537,
"rewards/accuracy_reward": 0.6429687380790711,
"rewards/batch_coverage_0": 0.2885128676891327,
"rewards/batch_coverage_1": 0.2885128676891327,
"rewards/batch_coverage_10": 0.41407533288002013,
"rewards/batch_coverage_15": 0.4346598982810974,
"rewards/batch_coverage_20": 0.4520686209201813,
"rewards/batch_coverage_25": 0.46169546246528625,
"rewards/batch_coverage_5": 0.3437255859375,
"rewards/brier_reward": 0.7455047130584717,
"rewards/confidence_uniqueness_reward": 0.7925597071647644,
"rewards/format_reward": 0.9861111283302307,
"rewards/frontier_aurc_reward": -0.00259375493042171,
"rewards/frontier_ece_reward": 0.03039606139063835,
"rewards/frontier_entropy_batch_reward": -0.49973989129066465,
"signal/accuracy_reward/centered_abs_mean": 0.19058702290058135,
"signal/accuracy_reward/group_std_mean": 0.24706077873706817,
"signal/accuracy_reward/group_zero_std_frac": 0.3222222298383713,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09529351145029068,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09529351145029068,
"signal/advantage_abs_mean": 0.20483049750328064,
"signal/advantage_pre_scale_abs_mean": 0.20483049750328064,
"signal/advantage_pre_scale_std": 0.2846250832080841,
"signal/advantage_std": 0.2846250832080841,
"signal/batch_coverage_0/centered_abs_mean": 0.20671135187149048,
"signal/batch_coverage_0/group_std_mean": 0.2817295253276825,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.02067113555967808,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.02067113555967808,
"signal/batch_coverage_1/centered_abs_mean": 0.20671135187149048,
"signal/batch_coverage_1/group_std_mean": 0.2817295253276825,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.02067113555967808,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.02067113555967808,
"signal/batch_coverage_10/centered_abs_mean": 0.24098095595836638,
"signal/batch_coverage_10/group_std_mean": 0.3243285596370697,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.024098095670342445,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.024098095670342445,
"signal/batch_coverage_15/centered_abs_mean": 0.24538744986057281,
"signal/batch_coverage_15/group_std_mean": 0.3284584999084473,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.02453874610364437,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.02453874610364437,
"signal/batch_coverage_20/centered_abs_mean": 0.25401687026023867,
"signal/batch_coverage_20/group_std_mean": 0.3383590757846832,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02540168762207031,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.02540168762207031,
"signal/batch_coverage_25/centered_abs_mean": 0.26198176145553587,
"signal/batch_coverage_25/group_std_mean": 0.3468038856983185,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.02619817741215229,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.02619817741215229,
"signal/batch_coverage_5/centered_abs_mean": 0.21985240578651427,
"signal/batch_coverage_5/group_std_mean": 0.29721832275390625,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.021985240653157233,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.021985240653157233,
"signal/brier_reward/centered_abs_mean": 0.22155551016330718,
"signal/brier_reward/group_std_mean": 0.2792714238166809,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.022155551612377165,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.022155551612377165,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1554034858942032,
"signal/confidence_uniqueness_reward/group_std_mean": 0.19029141664505006,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.015540349110960961,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.015540349110960961,
"signal/format_reward/centered_abs_mean": 0.02329644076526165,
"signal/format_reward/group_std_mean": 0.0442409373819828,
"signal/format_reward/group_zero_std_frac": 0.8138888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.011648220382630825,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.011648220382630825,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0022925134282559155,
"signal/frontier_aurc_reward/group_std_mean": 0.0032842617481946947,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.865641945390962e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.865641945390962e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.05770089849829674,
"signal/frontier_ece_reward/group_std_mean": 0.07514400482177734,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.005770089849829674,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.005770089849829674,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.41969223618507384,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4983623504638672,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.002777777798473835,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.04196922481060028,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.04196922481060028,
"step": 50
},
{
"epoch": 0.11999850001874976,
"eval_calibration/aurc": 0.18383896034708969,
"eval_calibration/batch_distribution_entropy": 0.6563989942090224,
"eval_calibration/buffer_distribution_entropy": 0.5797339776512932,
"eval_calibration/confidence_entropy": 0.25943922478959713,
"eval_calibration/coverage@0%": 0.17271505376344085,
"eval_calibration/coverage@1%": 0.17271505376344085,
"eval_calibration/coverage@10%": 0.3370295698924732,
"eval_calibration/coverage@15%": 0.478494623655914,
"eval_calibration/coverage@20%": 0.6041666666666666,
"eval_calibration/coverage@25%": 0.7447916666666666,
"eval_calibration/coverage@30%": 0.9635416666666666,
"eval_calibration/coverage@5%": 0.17271505376344085,
"eval_calibration/ece": 0.2373774714833392,
"eval_calibration/mean_confidence": 0.6960221763524769,
"eval_completions/clipped_ratio": 0.009548611111111105,
"eval_completions/max_length": 3419.6666666666665,
"eval_completions/max_terminated_length": 3419.6666666666665,
"eval_completions/mean_length": 820.9784952799479,
"eval_completions/mean_terminated_length": 828.7581787109375,
"eval_completions/min_length": 108.83333333333333,
"eval_completions/min_terminated_length": 306.3333333333333,
"eval_loss": 0.0,
"eval_num_tokens": 97092618.0,
"eval_reward": 0.9636127551396688,
"eval_reward_std": 0.3094961891571681,
"eval_rewards/accuracy_reward": 0.6571180522441864,
"eval_rewards/batch_coverage_0": 0.060765147441998124,
"eval_rewards/batch_coverage_1": 0.060765147441998124,
"eval_rewards/batch_coverage_10": 0.09114414701859157,
"eval_rewards/batch_coverage_15": 0.12853367378314337,
"eval_rewards/batch_coverage_20": 0.16657670897742113,
"eval_rewards/batch_coverage_25": 0.2249002829194069,
"eval_rewards/batch_coverage_5": 0.07248803181573749,
"eval_rewards/brier_reward": 0.7720614771048228,
"eval_rewards/confidence_uniqueness_reward": 0.7892127931118011,
"eval_rewards/format_reward": 0.9869791666666666,
"eval_rewards/frontier_aurc_reward": -0.0020255156753895185,
"eval_rewards/frontier_ece_reward": 0.03642605741818746,
"eval_rewards/frontier_entropy_batch_reward": -0.9869791666666666,
"eval_runtime": 197.803,
"eval_samples_per_second": 5.056,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4374457498391469,
"eval_signal/accuracy_reward/group_std_mean": 0.4739072273174922,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21872287491957346,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21872287491957346,
"eval_signal/advantage_abs_mean": 0.2603100687265396,
"eval_signal/advantage_pre_scale_abs_mean": 0.2603100687265396,
"eval_signal/advantage_pre_scale_std": 0.30781379838784534,
"eval_signal/advantage_std": 0.30781379838784534,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.2784964566429456,
"eval_signal/batch_coverage_0/group_std_mean": 0.42198194066683453,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.02784964597473542,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.02784964597473542,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.2784964566429456,
"eval_signal/batch_coverage_1/group_std_mean": 0.42198194066683453,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.02784964597473542,
"eval_signal/batch_coverage_1/weight": 0.10000000149011612,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.02784964597473542,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.26380258798599243,
"eval_signal/batch_coverage_10/group_std_mean": 0.38416793445746106,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.02638025985409816,
"eval_signal/batch_coverage_10/weight": 0.10000000149011612,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.02638025985409816,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.26254650205373764,
"eval_signal/batch_coverage_15/group_std_mean": 0.3666886240243912,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.026254651757578056,
"eval_signal/batch_coverage_15/weight": 0.10000000149011612,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.026254651757578056,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.2590629483262698,
"eval_signal/batch_coverage_20/group_std_mean": 0.3404406060775121,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.025906294273833435,
"eval_signal/batch_coverage_20/weight": 0.10000000149011612,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.025906294273833435,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.32382526497046155,
"eval_signal/batch_coverage_25/group_std_mean": 0.39501046637694043,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.032382527055839695,
"eval_signal/batch_coverage_25/weight": 0.10000000149011612,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.032382527055839695,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.2789640575647354,
"eval_signal/batch_coverage_5/group_std_mean": 0.41690531373023987,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.02789640674988429,
"eval_signal/batch_coverage_5/weight": 0.10000000149011612,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.02789640674988429,
"eval_signal/brier_reward/centered_abs_mean": 0.2922615110874176,
"eval_signal/brier_reward/group_std_mean": 0.3561810503403346,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.029226152536769707,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.029226152536769707,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.13250999401013056,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.1647503450512886,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.01325099915266037,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.01325099915266037,
"eval_signal/format_reward/centered_abs_mean": 0.02501085043574373,
"eval_signal/format_reward/group_std_mean": 0.06767813830326001,
"eval_signal/format_reward/group_zero_std_frac": 0.6388889104127884,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.012505425217871865,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.012505425217871865,
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.002747349014195303,
"eval_signal/frontier_aurc_reward/group_std_mean": 0.004716326482594013,
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.434186419326579e-05,
"eval_signal/frontier_aurc_reward/weight": 0.012500000186264515,
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.434186419326579e-05,
"eval_signal/frontier_ece_reward/centered_abs_mean": 0.0681057870388031,
"eval_signal/frontier_ece_reward/group_std_mean": 0.08917686839898427,
"eval_signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.006810579060887297,
"eval_signal/frontier_ece_reward/weight": 0.10000000149011612,
"eval_signal/frontier_ece_reward/weighted_centered_abs_mean": 0.006810579060887297,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.02501085043574373,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.06767813830326001,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.6388889104127884,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.00250108518715327,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.00250108518715327,
"eval_steps_per_second": 0.03,
"step": 50
},
{
"calibration/aurc": 0.23900662951911134,
"calibration/batch_distribution_entropy": 0.6892578373116545,
"calibration/buffer_distribution_entropy": 0.5916157765780743,
"calibration/confidence_entropy": 0.2553431192007102,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.1387434554973822,
"calibration/coverage@15%": 0.16125654450261778,
"calibration/coverage@20%": 0.5184954532929181,
"calibration/coverage@25%": 0.677514619883041,
"calibration/coverage@30%": 0.8519459409679054,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.19219343225644983,
"calibration/mean_confidence": 0.7162487190539071,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017013888888888905,
"completions/max_length": 3640.6,
"completions/max_terminated_length": 3640.6,
"completions/mean_length": 801.1100708007813,
"completions/mean_terminated_length": 815.0879760742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.2,
"epoch": 0.13199835002062474,
"grad_norm": 0.0007020493503659964,
"learning_rate": 4.60843373493976e-06,
"loss": -0.0171,
"num_tokens": 109401982.0,
"reward": 1.2057682991027832,
"reward_std": 0.2689946472644806,
"rewards/accuracy_reward": 0.6462673544883728,
"rewards/batch_coverage_0": 0.28498372435569763,
"rewards/batch_coverage_1": 0.28498372435569763,
"rewards/batch_coverage_10": 0.43428103923797606,
"rewards/batch_coverage_15": 0.44705964922904967,
"rewards/batch_coverage_20": 0.4741877973079681,
"rewards/batch_coverage_25": 0.4844040036201477,
"rewards/batch_coverage_5": 0.38492555618286134,
"rewards/brier_reward": 0.760375726222992,
"rewards/confidence_uniqueness_reward": 0.8418051481246949,
"rewards/format_reward": 0.9821180582046509,
"rewards/frontier_aurc_reward": -0.002279853168874979,
"rewards/frontier_ece_reward": 0.03188092932105065,
"rewards/frontier_entropy_batch_reward": -0.5128462076187134,
"signal/accuracy_reward/centered_abs_mean": 0.18359917402267456,
"signal/accuracy_reward/group_std_mean": 0.24245889782905578,
"signal/accuracy_reward/group_zero_std_frac": 0.30555555820465086,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09179958701133728,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09179958701133728,
"signal/advantage_abs_mean": 0.20495306551456452,
"signal/advantage_pre_scale_abs_mean": 0.20495306551456452,
"signal/advantage_pre_scale_std": 0.2879967331886292,
"signal/advantage_std": 0.2879967331886292,
"signal/batch_coverage_0/centered_abs_mean": 0.21537235081195832,
"signal/batch_coverage_0/group_std_mean": 0.2897315204143524,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.021537235751748086,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.021537235751748086,
"signal/batch_coverage_1/centered_abs_mean": 0.21537235081195832,
"signal/batch_coverage_1/group_std_mean": 0.2897315204143524,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.021537235751748086,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.021537235751748086,
"signal/batch_coverage_10/centered_abs_mean": 0.2431473821401596,
"signal/batch_coverage_10/group_std_mean": 0.3251115679740906,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.02431473843753338,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.02431473843753338,
"signal/batch_coverage_15/centered_abs_mean": 0.2471532106399536,
"signal/batch_coverage_15/group_std_mean": 0.3280351758003235,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.024715321511030196,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.024715321511030196,
"signal/batch_coverage_20/centered_abs_mean": 0.26155039072036745,
"signal/batch_coverage_20/group_std_mean": 0.3446295440196991,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02615503966808319,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.02615503966808319,
"signal/batch_coverage_25/centered_abs_mean": 0.26704921424388883,
"signal/batch_coverage_25/group_std_mean": 0.35001180768013,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.026704922690987586,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.026704922690987586,
"signal/batch_coverage_5/centered_abs_mean": 0.23431369364261628,
"signal/batch_coverage_5/group_std_mean": 0.31417744755744936,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.02343137003481388,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.02343137003481388,
"signal/brier_reward/centered_abs_mean": 0.21910977661609649,
"signal/brier_reward/group_std_mean": 0.2725955665111542,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.021910977363586426,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.021910977363586426,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.11237560659646988,
"signal/confidence_uniqueness_reward/group_std_mean": 0.14760001599788666,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.011237560398876667,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.011237560398876667,
"signal/format_reward/centered_abs_mean": 0.02948133647441864,
"signal/format_reward/group_std_mean": 0.055926169455051425,
"signal/format_reward/group_zero_std_frac": 0.7694444417953491,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01474066823720932,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01474066823720932,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0023909094743430614,
"signal/frontier_aurc_reward/group_std_mean": 0.0037250214256346224,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.988636879308615e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.988636879308615e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.055272321403026584,
"signal/frontier_ece_reward/group_std_mean": 0.0705582395195961,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.005527231935411691,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.005527231935411691,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.4059657871723175,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4828205406665802,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.008333333395421505,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.04059657901525497,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.04059657901525497,
"step": 55
},
{
"calibration/aurc": 0.27999538708938243,
"calibration/batch_distribution_entropy": 0.6776643316316128,
"calibration/buffer_distribution_entropy": 0.6025841857243535,
"calibration/confidence_entropy": 0.256365245569243,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.08167539267015707,
"calibration/coverage@15%": 0.34397905759162306,
"calibration/coverage@20%": 0.4451284034867176,
"calibration/coverage@25%": 0.49979416762215256,
"calibration/coverage@30%": 0.5467018469656992,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.19908587966332278,
"calibration/mean_confidence": 0.7527650724916545,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012586805555555558,
"completions/max_length": 3794.0,
"completions/max_terminated_length": 3794.0,
"completions/mean_length": 789.0218017578125,
"completions/mean_terminated_length": 799.2080444335937,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.6,
"epoch": 0.14399820002249972,
"grad_norm": 0.0007612982881255448,
"learning_rate": 4.457831325301205e-06,
"loss": -0.0164,
"num_tokens": 121588089.0,
"reward": 1.2023053884506225,
"reward_std": 0.24645790457725525,
"rewards/accuracy_reward": 0.6163194298744201,
"rewards/batch_coverage_0": 0.30756841897964476,
"rewards/batch_coverage_1": 0.30756841897964476,
"rewards/batch_coverage_10": 0.4529237329959869,
"rewards/batch_coverage_15": 0.4701856553554535,
"rewards/batch_coverage_20": 0.49019448161125184,
"rewards/batch_coverage_25": 0.5011613070964813,
"rewards/batch_coverage_5": 0.3910254955291748,
"rewards/brier_reward": 0.7737071990966797,
"rewards/confidence_uniqueness_reward": 0.8230122208595276,
"rewards/format_reward": 0.9872395873069764,
"rewards/frontier_aurc_reward": -0.0027119277510792017,
"rewards/frontier_ece_reward": 0.03474101237952709,
"rewards/frontier_entropy_batch_reward": -0.5464899241924286,
"signal/accuracy_reward/centered_abs_mean": 0.1891059011220932,
"signal/accuracy_reward/group_std_mean": 0.2480522572994232,
"signal/accuracy_reward/group_zero_std_frac": 0.3027777820825577,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0945529505610466,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0945529505610466,
"signal/advantage_abs_mean": 0.18332133889198304,
"signal/advantage_pre_scale_abs_mean": 0.18332133889198304,
"signal/advantage_pre_scale_std": 0.2718550503253937,
"signal/advantage_std": 0.2718550503253937,
"signal/batch_coverage_0/centered_abs_mean": 0.17082740664482116,
"signal/batch_coverage_0/group_std_mean": 0.2339889734983444,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.017082741484045982,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017082741484045982,
"signal/batch_coverage_1/centered_abs_mean": 0.17082740664482116,
"signal/batch_coverage_1/group_std_mean": 0.2339889734983444,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.017082741484045982,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.017082741484045982,
"signal/batch_coverage_10/centered_abs_mean": 0.20480410158634185,
"signal/batch_coverage_10/group_std_mean": 0.2818973779678345,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.020480410754680635,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.020480410754680635,
"signal/batch_coverage_15/centered_abs_mean": 0.21194995045661927,
"signal/batch_coverage_15/group_std_mean": 0.29027042388916013,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.02119499444961548,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.02119499444961548,
"signal/batch_coverage_20/centered_abs_mean": 0.2243577092885971,
"signal/batch_coverage_20/group_std_mean": 0.30434967279434205,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02243577167391777,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.02243577167391777,
"signal/batch_coverage_25/centered_abs_mean": 0.2355395793914795,
"signal/batch_coverage_25/group_std_mean": 0.3164583444595337,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.023553960025310516,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.023553960025310516,
"signal/batch_coverage_5/centered_abs_mean": 0.18540128767490388,
"signal/batch_coverage_5/group_std_mean": 0.25556024312973025,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01854012943804264,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01854012943804264,
"signal/brier_reward/centered_abs_mean": 0.18852369487285614,
"signal/brier_reward/group_std_mean": 0.24252268970012664,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.018852369859814644,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.018852369859814644,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1297829270362854,
"signal/confidence_uniqueness_reward/group_std_mean": 0.16556560397148132,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.012978293374180794,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.012978293374180794,
"signal/format_reward/centered_abs_mean": 0.022477213479578494,
"signal/format_reward/group_std_mean": 0.04559006839990616,
"signal/format_reward/group_zero_std_frac": 0.800000011920929,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.011238606739789247,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.011238606739789247,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0027325002010911703,
"signal/frontier_aurc_reward/group_std_mean": 0.004154342599213124,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.415625178604387e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.415625178604387e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.047476866841316225,
"signal/frontier_ece_reward/group_std_mean": 0.05966043248772621,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.004747686814516783,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.004747686814516783,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.37659157514572145,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.45627657175064085,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.016666666977107523,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.03765915706753731,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03765915706753731,
"step": 60
},
{
"calibration/aurc": 0.19163294910122897,
"calibration/batch_distribution_entropy": 0.7544690242665435,
"calibration/buffer_distribution_entropy": 0.6190086662203866,
"calibration/confidence_entropy": 0.27602314148975093,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.05639686684073107,
"calibration/coverage@10%": 0.2681954260249067,
"calibration/coverage@15%": 0.5397189208834419,
"calibration/coverage@20%": 0.6283629626508924,
"calibration/coverage@25%": 0.6912652276881658,
"calibration/coverage@30%": 0.8117131383051183,
"calibration/coverage@5%": 0.1404699738903394,
"calibration/ece": 0.117494904221392,
"calibration/mean_confidence": 0.6757113075851048,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013107638888888884,
"completions/max_length": 3759.8,
"completions/max_terminated_length": 3759.8,
"completions/mean_length": 784.5288208007812,
"completions/mean_terminated_length": 794.96884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.1559980500243747,
"grad_norm": 0.0006685956032015383,
"learning_rate": 4.307228915662651e-06,
"loss": -0.0141,
"num_tokens": 133719909.0,
"reward": 1.2486091375350952,
"reward_std": 0.2386137217283249,
"rewards/accuracy_reward": 0.6405381917953491,
"rewards/batch_coverage_0": 0.35705705881118777,
"rewards/batch_coverage_1": 0.35705705881118777,
"rewards/batch_coverage_10": 0.47446699142456056,
"rewards/batch_coverage_15": 0.49599921703338623,
"rewards/batch_coverage_20": 0.5140182256698609,
"rewards/batch_coverage_25": 0.5188945710659028,
"rewards/batch_coverage_5": 0.41983569860458375,
"rewards/brier_reward": 0.8078495979309082,
"rewards/confidence_uniqueness_reward": 0.8686538100242615,
"rewards/format_reward": 0.9868923544883728,
"rewards/frontier_aurc_reward": -0.0016949245939031244,
"rewards/frontier_ece_reward": 0.03309335187077522,
"rewards/frontier_entropy_batch_reward": -0.4977757096290588,
"signal/accuracy_reward/centered_abs_mean": 0.172509765625,
"signal/accuracy_reward/group_std_mean": 0.23307301700115204,
"signal/accuracy_reward/group_zero_std_frac": 0.3222222328186035,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0862548828125,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0862548828125,
"signal/advantage_abs_mean": 0.17538017630577088,
"signal/advantage_pre_scale_abs_mean": 0.17538017630577088,
"signal/advantage_pre_scale_std": 0.2610613316297531,
"signal/advantage_std": 0.2610613316297531,
"signal/batch_coverage_0/centered_abs_mean": 0.17853089570999145,
"signal/batch_coverage_0/group_std_mean": 0.2395469069480896,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.017853090167045595,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017853090167045595,
"signal/batch_coverage_1/centered_abs_mean": 0.17853089570999145,
"signal/batch_coverage_1/group_std_mean": 0.2395469069480896,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.017853090167045595,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.017853090167045595,
"signal/batch_coverage_10/centered_abs_mean": 0.20888802111148835,
"signal/batch_coverage_10/group_std_mean": 0.27947876453399656,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.020888802036643027,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.020888802036643027,
"signal/batch_coverage_15/centered_abs_mean": 0.21613354980945587,
"signal/batch_coverage_15/group_std_mean": 0.28842523097991946,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.021613356098532675,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.021613356098532675,
"signal/batch_coverage_20/centered_abs_mean": 0.22640545964241027,
"signal/batch_coverage_20/group_std_mean": 0.3013183057308197,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.022640545666217805,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.022640545666217805,
"signal/batch_coverage_25/centered_abs_mean": 0.22826351821422577,
"signal/batch_coverage_25/group_std_mean": 0.30373467206954957,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.022826352342963218,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.022826352342963218,
"signal/batch_coverage_5/centered_abs_mean": 0.19246315062046052,
"signal/batch_coverage_5/group_std_mean": 0.25800671279430387,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.019246315211057664,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.019246315211057664,
"signal/brier_reward/centered_abs_mean": 0.17271728515625,
"signal/brier_reward/group_std_mean": 0.22596350610256194,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.017271729186177254,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.017271729186177254,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.10451890975236892,
"signal/confidence_uniqueness_reward/group_std_mean": 0.1398794487118721,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.010451891273260117,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.010451891273260117,
"signal/format_reward/centered_abs_mean": 0.02364908903837204,
"signal/format_reward/group_std_mean": 0.04641622006893158,
"signal/format_reward/group_zero_std_frac": 0.8083333373069763,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.01182454451918602,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.01182454451918602,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0019812505692243577,
"signal/frontier_aurc_reward/group_std_mean": 0.0031359643675386907,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.4765633133938537e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.4765633133938537e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.039264075458049774,
"signal/frontier_ece_reward/group_std_mean": 0.05073797106742859,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0039264077320694925,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0039264077320694925,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.36650044918060304,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.44260573387145996,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.013888888992369175,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.036650046706199646,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.036650046706199646,
"step": 65
},
{
"calibration/aurc": 0.23999176408337725,
"calibration/batch_distribution_entropy": 0.6836312918585741,
"calibration/buffer_distribution_entropy": 0.6338147504036498,
"calibration/confidence_entropy": 0.24961262377196078,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.12295514511873351,
"calibration/coverage@15%": 0.2390515169772644,
"calibration/coverage@20%": 0.31742437347393787,
"calibration/coverage@25%": 0.44004092461640465,
"calibration/coverage@30%": 0.793044473258377,
"calibration/coverage@5%": 0.0891820580474934,
"calibration/ece": 0.17061860165493908,
"calibration/mean_confidence": 0.7117827796613386,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01675347222222221,
"completions/max_length": 3543.4,
"completions/max_terminated_length": 3543.4,
"completions/mean_length": 777.2517456054687,
"completions/mean_terminated_length": 790.4873779296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.4,
"epoch": 0.16799790002624967,
"grad_norm": 0.0006346319569274783,
"learning_rate": 4.156626506024097e-06,
"loss": -0.0198,
"num_tokens": 145751993.0,
"reward": 1.2204406261444092,
"reward_std": 0.23513408601284028,
"rewards/accuracy_reward": 0.6188367962837219,
"rewards/batch_coverage_0": 0.34071238040924073,
"rewards/batch_coverage_1": 0.34071238040924073,
"rewards/batch_coverage_10": 0.4735415160655975,
"rewards/batch_coverage_15": 0.4864673376083374,
"rewards/batch_coverage_20": 0.4995663225650787,
"rewards/batch_coverage_25": 0.5093587577342987,
"rewards/batch_coverage_5": 0.41144366264343263,
"rewards/brier_reward": 0.7896717667579651,
"rewards/confidence_uniqueness_reward": 0.8611388564109802,
"rewards/format_reward": 0.9831597208976746,
"rewards/frontier_aurc_reward": -0.0018688955111429094,
"rewards/frontier_ece_reward": 0.030099016055464743,
"rewards/frontier_entropy_batch_reward": -0.5480550169944763,
"signal/accuracy_reward/centered_abs_mean": 0.176611328125,
"signal/accuracy_reward/group_std_mean": 0.23280532658100128,
"signal/accuracy_reward/group_zero_std_frac": 0.3388888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0883056640625,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0883056640625,
"signal/advantage_abs_mean": 0.17352637350559236,
"signal/advantage_pre_scale_abs_mean": 0.17352637350559236,
"signal/advantage_pre_scale_std": 0.26246364414691925,
"signal/advantage_std": 0.26246364414691925,
"signal/batch_coverage_0/centered_abs_mean": 0.17251457870006562,
"signal/batch_coverage_0/group_std_mean": 0.2311745971441269,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.017251458019018173,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.017251458019018173,
"signal/batch_coverage_1/centered_abs_mean": 0.17251457870006562,
"signal/batch_coverage_1/group_std_mean": 0.2311745971441269,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.017251458019018173,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.017251458019018173,
"signal/batch_coverage_10/centered_abs_mean": 0.204291170835495,
"signal/batch_coverage_10/group_std_mean": 0.27622554302215574,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.02042911797761917,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.02042911797761917,
"signal/batch_coverage_15/centered_abs_mean": 0.2088209420442581,
"signal/batch_coverage_15/group_std_mean": 0.28075531125068665,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.020882094651460646,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.020882094651460646,
"signal/batch_coverage_20/centered_abs_mean": 0.2121003121137619,
"signal/batch_coverage_20/group_std_mean": 0.2855076462030411,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02121003195643425,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.02121003195643425,
"signal/batch_coverage_25/centered_abs_mean": 0.21743012368679046,
"signal/batch_coverage_25/group_std_mean": 0.2922160685062408,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.021743013337254525,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.021743013337254525,
"signal/batch_coverage_5/centered_abs_mean": 0.18528588414192199,
"signal/batch_coverage_5/group_std_mean": 0.24809181988239287,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.018528588488698004,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.018528588488698004,
"signal/brier_reward/centered_abs_mean": 0.17164082825183868,
"signal/brier_reward/group_std_mean": 0.2222065269947052,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.017164083570241927,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.017164083570241927,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1031325027346611,
"signal/confidence_uniqueness_reward/group_std_mean": 0.13971158862113953,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.010313250403851271,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.010313250403851271,
"signal/format_reward/centered_abs_mean": 0.02896050326526165,
"signal/format_reward/group_std_mean": 0.05549793913960457,
"signal/format_reward/group_zero_std_frac": 0.7694444417953491,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.014480251632630826,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.014480251632630826,
"signal/frontier_aurc_reward/centered_abs_mean": 0.001962455874308944,
"signal/frontier_aurc_reward/group_std_mean": 0.0030693566892296075,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.4530698647140524e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.4530698647140524e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.035511156916618346,
"signal/frontier_ece_reward/group_std_mean": 0.04602086395025253,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0035511157941073178,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0035511157941073178,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3593166649341583,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.433760666847229,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.038888888992369176,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.035931666195392606,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.035931666195392606,
"step": 70
},
{
"calibration/aurc": 0.17997203334443818,
"calibration/batch_distribution_entropy": 0.5623722645304415,
"calibration/buffer_distribution_entropy": 0.6359015524760088,
"calibration/confidence_entropy": 0.2136270373288652,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.3683673011495868,
"calibration/coverage@15%": 0.5345634641198306,
"calibration/coverage@20%": 0.6111288934411654,
"calibration/coverage@25%": 0.6967545723451235,
"calibration/coverage@30%": 0.7372661750614506,
"calibration/coverage@5%": 0.0,
"calibration/ece": 0.15312949408298948,
"calibration/mean_confidence": 0.7853518189875418,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015451388888888862,
"completions/max_length": 3386.4,
"completions/max_terminated_length": 3386.4,
"completions/mean_length": 780.0697998046875,
"completions/mean_terminated_length": 792.3125244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 246.4,
"epoch": 0.17999775002812465,
"grad_norm": 0.0005761557258665562,
"learning_rate": 4.006024096385543e-06,
"loss": -0.0137,
"num_tokens": 157803293.0,
"reward": 1.274224543571472,
"reward_std": 0.2257396847009659,
"rewards/accuracy_reward": 0.6721354246139526,
"rewards/batch_coverage_0": 0.3722359657287598,
"rewards/batch_coverage_1": 0.3722359657287598,
"rewards/batch_coverage_10": 0.5174551129341125,
"rewards/batch_coverage_15": 0.5403218030929565,
"rewards/batch_coverage_20": 0.5625471234321594,
"rewards/batch_coverage_25": 0.5715397596359253,
"rewards/batch_coverage_5": 0.4612420558929443,
"rewards/brier_reward": 0.8187686681747437,
"rewards/confidence_uniqueness_reward": 0.8240198254585266,
"rewards/format_reward": 0.9844618082046509,
"rewards/frontier_aurc_reward": -0.0016686252551153303,
"rewards/frontier_ece_reward": 0.03394242897629738,
"rewards/frontier_entropy_batch_reward": -0.6148412227630615,
"signal/accuracy_reward/centered_abs_mean": 0.17437608540058136,
"signal/accuracy_reward/group_std_mean": 0.23193702697753907,
"signal/accuracy_reward/group_zero_std_frac": 0.33333333730697634,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08718804270029068,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08718804270029068,
"signal/advantage_abs_mean": 0.16320714354515076,
"signal/advantage_pre_scale_abs_mean": 0.16320714354515076,
"signal/advantage_pre_scale_std": 0.2634482055902481,
"signal/advantage_std": 0.2634482055902481,
"signal/batch_coverage_0/centered_abs_mean": 0.13958706557750702,
"signal/batch_coverage_0/group_std_mean": 0.19350030422210693,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.013958707079291343,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013958707079291343,
"signal/batch_coverage_1/centered_abs_mean": 0.13958706557750702,
"signal/batch_coverage_1/group_std_mean": 0.19350030422210693,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.013958707079291343,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.013958707079291343,
"signal/batch_coverage_10/centered_abs_mean": 0.17092832624912263,
"signal/batch_coverage_10/group_std_mean": 0.24247340261936187,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.017092833295464516,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.017092833295464516,
"signal/batch_coverage_15/centered_abs_mean": 0.18111636042594909,
"signal/batch_coverage_15/group_std_mean": 0.25312560200691225,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.018111636117100715,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.018111636117100715,
"signal/batch_coverage_20/centered_abs_mean": 0.1944339394569397,
"signal/batch_coverage_20/group_std_mean": 0.26790787279605865,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.019443394616246223,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.019443394616246223,
"signal/batch_coverage_25/centered_abs_mean": 0.203565114736557,
"signal/batch_coverage_25/group_std_mean": 0.27769856452941893,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.02035651244223118,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.02035651244223118,
"signal/batch_coverage_5/centered_abs_mean": 0.1548121303319931,
"signal/batch_coverage_5/group_std_mean": 0.2177237719297409,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01548121329396963,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01548121329396963,
"signal/brier_reward/centered_abs_mean": 0.1549970895051956,
"signal/brier_reward/group_std_mean": 0.20645565688610076,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01549970917403698,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01549970917403698,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.10375804156064987,
"signal/confidence_uniqueness_reward/group_std_mean": 0.13831095546483993,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.01037580445408821,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.01037580445408821,
"signal/format_reward/centered_abs_mean": 0.02744683139026165,
"signal/format_reward/group_std_mean": 0.0541893906891346,
"signal/format_reward/group_zero_std_frac": 0.7694444537162781,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.013723415695130825,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.013723415695130825,
"signal/frontier_aurc_reward/centered_abs_mean": 0.001943652448244393,
"signal/frontier_aurc_reward/group_std_mean": 0.0031304992735385895,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.4295656112371943e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.4295656112371943e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.03213078454136849,
"signal/frontier_ece_reward/group_std_mean": 0.042379957437515256,
"signal/frontier_ece_reward/group_zero_std_frac": 0.002777777798473835,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0032130784820765257,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0032130784820765257,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.29642507433891296,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.37213156223297117,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.13055555820465087,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.029642507433891296,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.029642507433891296,
"step": 75
},
{
"calibration/aurc": 0.18531783499401064,
"calibration/batch_distribution_entropy": 0.5264748528796649,
"calibration/buffer_distribution_entropy": 0.6301669153476362,
"calibration/confidence_entropy": 0.20656610665663813,
"calibration/coverage@0%": 0.018947368421052633,
"calibration/coverage@1%": 0.018947368421052633,
"calibration/coverage@10%": 0.30307596167198997,
"calibration/coverage@15%": 0.5106729483104138,
"calibration/coverage@20%": 0.5678850159700042,
"calibration/coverage@25%": 0.7134553775743708,
"calibration/coverage@30%": 0.9037159420289855,
"calibration/coverage@5%": 0.2714622969032079,
"calibration/ece": 0.1599405047177906,
"calibration/mean_confidence": 0.7885415614674296,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.014670138888888906,
"completions/max_length": 3726.8,
"completions/max_terminated_length": 3726.8,
"completions/mean_length": 823.2955688476562,
"completions/mean_terminated_length": 835.6906127929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.19199760002999963,
"grad_norm": 0.0006295799976214767,
"learning_rate": 3.855421686746989e-06,
"loss": -0.0148,
"num_tokens": 170340938.0,
"reward": 1.2308602809906006,
"reward_std": 0.23434819877147675,
"rewards/accuracy_reward": 0.6280381858348847,
"rewards/batch_coverage_0": 0.35562955141067504,
"rewards/batch_coverage_1": 0.35562955141067504,
"rewards/batch_coverage_10": 0.4737295091152191,
"rewards/batch_coverage_15": 0.4910382151603699,
"rewards/batch_coverage_20": 0.5097373247146606,
"rewards/batch_coverage_25": 0.5154181003570557,
"rewards/batch_coverage_5": 0.42273095846176145,
"rewards/brier_reward": 0.8010688662528992,
"rewards/confidence_uniqueness_reward": 0.860322093963623,
"rewards/format_reward": 0.98515625,
"rewards/frontier_aurc_reward": -0.001912536984309554,
"rewards/frontier_ece_reward": 0.027306798845529556,
"rewards/frontier_entropy_batch_reward": -0.569741952419281,
"signal/accuracy_reward/centered_abs_mean": 0.18927408754825592,
"signal/accuracy_reward/group_std_mean": 0.24705808460712433,
"signal/accuracy_reward/group_zero_std_frac": 0.30555555820465086,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09463704377412796,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.09463704377412796,
"signal/advantage_abs_mean": 0.17279953360557557,
"signal/advantage_pre_scale_abs_mean": 0.17279953360557557,
"signal/advantage_pre_scale_std": 0.2633059620857239,
"signal/advantage_std": 0.2633059620857239,
"signal/batch_coverage_0/centered_abs_mean": 0.16080776751041412,
"signal/batch_coverage_0/group_std_mean": 0.21778478026390075,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.016080777533352374,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.016080777533352374,
"signal/batch_coverage_1/centered_abs_mean": 0.16080776751041412,
"signal/batch_coverage_1/group_std_mean": 0.21778478026390075,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.016080777533352374,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.016080777533352374,
"signal/batch_coverage_10/centered_abs_mean": 0.1845082998275757,
"signal/batch_coverage_10/group_std_mean": 0.2537638247013092,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01845082901418209,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01845082901418209,
"signal/batch_coverage_15/centered_abs_mean": 0.19259609282016754,
"signal/batch_coverage_15/group_std_mean": 0.2654243648052216,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.019259610027074815,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.019259610027074815,
"signal/batch_coverage_20/centered_abs_mean": 0.2040830820798874,
"signal/batch_coverage_20/group_std_mean": 0.2802187502384186,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.020408308133482933,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.020408308133482933,
"signal/batch_coverage_25/centered_abs_mean": 0.21111299395561217,
"signal/batch_coverage_25/group_std_mean": 0.28788386583328246,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.021111299842596055,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.021111299842596055,
"signal/batch_coverage_5/centered_abs_mean": 0.17277827262878417,
"signal/batch_coverage_5/group_std_mean": 0.2342999130487442,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.017277827486395836,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.017277827486395836,
"signal/brier_reward/centered_abs_mean": 0.16930352449417113,
"signal/brier_reward/group_std_mean": 0.22312399744987488,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.016930353082716464,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.016930353082716464,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08183257579803467,
"signal/confidence_uniqueness_reward/group_std_mean": 0.1107841432094574,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008183257654309273,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008183257654309273,
"signal/format_reward/centered_abs_mean": 0.023681640066206454,
"signal/format_reward/group_std_mean": 0.04482636339962483,
"signal/format_reward/group_zero_std_frac": 0.8138888835906982,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.011840820033103227,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.011840820033103227,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0017274004174396395,
"signal/frontier_aurc_reward/group_std_mean": 0.002619408327154815,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.159250379918376e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.159250379918376e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.030476462468504904,
"signal/frontier_ece_reward/group_std_mean": 0.040614823997020724,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0030476462095975878,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0030476462095975878,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3353477716445923,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4120087206363678,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.09444444589316844,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.03353477939963341,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03353477939963341,
"step": 80
},
{
"calibration/aurc": 0.15409807022690386,
"calibration/batch_distribution_entropy": 0.7416203857286623,
"calibration/buffer_distribution_entropy": 0.6372012889686948,
"calibration/confidence_entropy": 0.27422233398342266,
"calibration/coverage@0%": 0.0109375,
"calibration/coverage@1%": 0.0109375,
"calibration/coverage@10%": 0.4266453998978691,
"calibration/coverage@15%": 0.6234678790038453,
"calibration/coverage@20%": 0.6907730186501071,
"calibration/coverage@25%": 0.7665074893636747,
"calibration/coverage@30%": 0.8581187629948495,
"calibration/coverage@5%": 0.24278833868530353,
"calibration/ece": 0.11338426350473374,
"calibration/mean_confidence": 0.6267695244515951,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012760416666666653,
"completions/max_length": 3565.4,
"completions/max_terminated_length": 3565.4,
"completions/mean_length": 823.7012084960937,
"completions/mean_terminated_length": 834.3744995117188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 272.2,
"epoch": 0.2039974500318746,
"grad_norm": 0.0005630561499856412,
"learning_rate": 3.7048192771084342e-06,
"loss": -0.012,
"num_tokens": 182917176.0,
"reward": 1.276980185508728,
"reward_std": 0.22686235904693602,
"rewards/accuracy_reward": 0.6433159708976746,
"rewards/batch_coverage_0": 0.38253883719444276,
"rewards/batch_coverage_1": 0.38253883719444276,
"rewards/batch_coverage_10": 0.5324840545654297,
"rewards/batch_coverage_15": 0.548856544494629,
"rewards/batch_coverage_20": 0.5570691585540771,
"rewards/batch_coverage_25": 0.5611120223999023,
"rewards/batch_coverage_5": 0.4730106830596924,
"rewards/brier_reward": 0.8340127110481262,
"rewards/confidence_uniqueness_reward": 0.8759592175483704,
"rewards/format_reward": 0.9869791746139527,
"rewards/frontier_aurc_reward": -0.0015283518703654408,
"rewards/frontier_ece_reward": 0.027334221825003623,
"rewards/frontier_entropy_batch_reward": -0.556399130821228,
"signal/accuracy_reward/centered_abs_mean": 0.17159830927848815,
"signal/accuracy_reward/group_std_mean": 0.22915640473365784,
"signal/accuracy_reward/group_zero_std_frac": 0.3388888895511627,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08579915463924408,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08579915463924408,
"signal/advantage_abs_mean": 0.1645752727985382,
"signal/advantage_pre_scale_abs_mean": 0.1645752727985382,
"signal/advantage_pre_scale_std": 0.2576048791408539,
"signal/advantage_std": 0.2576048791408539,
"signal/batch_coverage_0/centered_abs_mean": 0.16249439418315886,
"signal/batch_coverage_0/group_std_mean": 0.21981814205646516,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.016249440237879755,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.016249440237879755,
"signal/batch_coverage_1/centered_abs_mean": 0.16249439418315886,
"signal/batch_coverage_1/group_std_mean": 0.21981814205646516,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.016249440237879755,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.016249440237879755,
"signal/batch_coverage_10/centered_abs_mean": 0.1962430626153946,
"signal/batch_coverage_10/group_std_mean": 0.2668556869029999,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01962430663406849,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01962430663406849,
"signal/batch_coverage_15/centered_abs_mean": 0.20319909751415252,
"signal/batch_coverage_15/group_std_mean": 0.2750951647758484,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.02031990997493267,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.02031990997493267,
"signal/batch_coverage_20/centered_abs_mean": 0.20662610232830048,
"signal/batch_coverage_20/group_std_mean": 0.27981886863708494,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02066261097788811,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.02066261097788811,
"signal/batch_coverage_25/centered_abs_mean": 0.20946857929229737,
"signal/batch_coverage_25/group_std_mean": 0.2836865186691284,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.020946857705712318,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.020946857705712318,
"signal/batch_coverage_5/centered_abs_mean": 0.18017423450946807,
"signal/batch_coverage_5/group_std_mean": 0.24363900423049928,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.018017425015568734,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.018017425015568734,
"signal/brier_reward/centered_abs_mean": 0.1504698157310486,
"signal/brier_reward/group_std_mean": 0.20289478003978728,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01504698134958744,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01504698134958744,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07075797319412232,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10061978846788407,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.007075797766447067,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007075797766447067,
"signal/format_reward/centered_abs_mean": 0.02328558973968029,
"signal/format_reward/group_std_mean": 0.047786331921815875,
"signal/format_reward/group_zero_std_frac": 0.7888889074325561,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.011642794869840146,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.011642794869840146,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0015381898963823914,
"signal/frontier_aurc_reward/group_std_mean": 0.002280404232442379,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.922737301356392e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.922737301356392e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.028247393295168878,
"signal/frontier_ece_reward/group_std_mean": 0.03832782506942749,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0028247392736375334,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0028247392736375334,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.330600780248642,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.41186041831970216,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.06388889066874981,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.03306007869541645,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03306007869541645,
"step": 85
},
{
"calibration/aurc": 0.1409074712236842,
"calibration/batch_distribution_entropy": 0.676989785966493,
"calibration/buffer_distribution_entropy": 0.6450539204074888,
"calibration/confidence_entropy": 0.24289489628080269,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.4740362776214056,
"calibration/coverage@15%": 0.6774752795708263,
"calibration/coverage@20%": 0.7559175357069725,
"calibration/coverage@25%": 0.8221792826805896,
"calibration/coverage@30%": 0.8984341667054151,
"calibration/coverage@5%": 0.3210218467549785,
"calibration/ece": 0.11206175796901677,
"calibration/mean_confidence": 0.703699617406267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00980902777777779,
"completions/max_length": 3131.6,
"completions/max_terminated_length": 3131.6,
"completions/mean_length": 805.4808349609375,
"completions/mean_terminated_length": 813.4924682617187,
"completions/min_length": 0.0,
"completions/min_terminated_length": 232.4,
"epoch": 0.2159973000337496,
"grad_norm": 0.0006051440723240376,
"learning_rate": 3.5542168674698798e-06,
"loss": -0.0106,
"num_tokens": 195264987.0,
"reward": 1.2629518032073974,
"reward_std": 0.21908413171768187,
"rewards/accuracy_reward": 0.6404513835906982,
"rewards/batch_coverage_0": 0.37480852007865906,
"rewards/batch_coverage_1": 0.37480852007865906,
"rewards/batch_coverage_10": 0.5070814311504364,
"rewards/batch_coverage_15": 0.5261907279491425,
"rewards/batch_coverage_20": 0.5442143380641937,
"rewards/batch_coverage_25": 0.5515429675579071,
"rewards/batch_coverage_5": 0.45254968404769896,
"rewards/brier_reward": 0.8282647132873535,
"rewards/confidence_uniqueness_reward": 0.8380447387695312,
"rewards/format_reward": 0.9901041746139526,
"rewards/frontier_aurc_reward": -0.0015266900416463613,
"rewards/frontier_ece_reward": 0.027132243290543555,
"rewards/frontier_entropy_batch_reward": -0.547706913948059,
"signal/accuracy_reward/centered_abs_mean": 0.17931857705116272,
"signal/accuracy_reward/group_std_mean": 0.2344184249639511,
"signal/accuracy_reward/group_zero_std_frac": 0.33611112236976626,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08965928852558136,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08965928852558136,
"signal/advantage_abs_mean": 0.15935106277465821,
"signal/advantage_pre_scale_abs_mean": 0.15935106277465821,
"signal/advantage_pre_scale_std": 0.2506345182657242,
"signal/advantage_std": 0.2506345182657242,
"signal/batch_coverage_0/centered_abs_mean": 0.15178050100803375,
"signal/batch_coverage_0/group_std_mean": 0.20413313806056976,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.015178050845861435,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.015178050845861435,
"signal/batch_coverage_1/centered_abs_mean": 0.15178050100803375,
"signal/batch_coverage_1/group_std_mean": 0.20413313806056976,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.015178050845861435,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.015178050845861435,
"signal/batch_coverage_10/centered_abs_mean": 0.1780288428068161,
"signal/batch_coverage_10/group_std_mean": 0.24395250976085664,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.017802884429693223,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.017802884429693223,
"signal/batch_coverage_15/centered_abs_mean": 0.1861760824918747,
"signal/batch_coverage_15/group_std_mean": 0.2549364984035492,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01861760877072811,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01861760877072811,
"signal/batch_coverage_20/centered_abs_mean": 0.1985442578792572,
"signal/batch_coverage_20/group_std_mean": 0.26970677971839907,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.019854425638914108,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.019854425638914108,
"signal/batch_coverage_25/centered_abs_mean": 0.20626583695411682,
"signal/batch_coverage_25/group_std_mean": 0.2787212669849396,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.020626583695411684,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.020626583695411684,
"signal/batch_coverage_5/centered_abs_mean": 0.16467028558254243,
"signal/batch_coverage_5/group_std_mean": 0.22229004502296448,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.016467029228806496,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.016467029228806496,
"signal/brier_reward/centered_abs_mean": 0.1509499132633209,
"signal/brier_reward/group_std_mean": 0.20125450193881989,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.015094991773366928,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.015094991773366928,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.09308070093393325,
"signal/confidence_uniqueness_reward/group_std_mean": 0.12461533695459366,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.009308070316910744,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.009308070316910744,
"signal/format_reward/centered_abs_mean": 0.018164062313735484,
"signal/format_reward/group_std_mean": 0.04039783589541912,
"signal/format_reward/group_zero_std_frac": 0.8111111283302307,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.009082031156867742,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.009082031156867742,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0018176564015448093,
"signal/frontier_aurc_reward/group_std_mean": 0.0029703597305342556,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.2720704691892024e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.2720704691892024e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.026793276891112328,
"signal/frontier_ece_reward/group_std_mean": 0.0369697593152523,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0026793277356773615,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0026793277356773615,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.31506091356277466,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.39477880001068116,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.08055555745959282,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.031506090238690374,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.031506090238690374,
"step": 90
},
{
"calibration/aurc": 0.18347185980137212,
"calibration/batch_distribution_entropy": 0.6614250966982602,
"calibration/buffer_distribution_entropy": 0.6505126033972186,
"calibration/confidence_entropy": 0.21783652138555804,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.5036943404007126,
"calibration/coverage@15%": 0.5772734599629088,
"calibration/coverage@20%": 0.6441130233893346,
"calibration/coverage@25%": 0.6854127650267248,
"calibration/coverage@30%": 0.7084193402867962,
"calibration/coverage@5%": 0.30512077426763085,
"calibration/ece": 0.14312376310403513,
"calibration/mean_confidence": 0.6829416717300276,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006336805555555558,
"completions/max_length": 3176.8,
"completions/max_terminated_length": 3176.8,
"completions/mean_length": 819.3296997070313,
"completions/mean_terminated_length": 824.5814819335938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.8,
"epoch": 0.22799715003562457,
"grad_norm": 0.0005972707294858992,
"learning_rate": 3.4036144578313257e-06,
"loss": -0.006,
"num_tokens": 207795345.0,
"reward": 1.2796026945114136,
"reward_std": 0.21020272970199586,
"rewards/accuracy_reward": 0.6392361164093018,
"rewards/batch_coverage_0": 0.4010293483734131,
"rewards/batch_coverage_1": 0.4010293483734131,
"rewards/batch_coverage_10": 0.5425745368003845,
"rewards/batch_coverage_15": 0.5669065713882446,
"rewards/batch_coverage_20": 0.575879442691803,
"rewards/batch_coverage_25": 0.5806684374809266,
"rewards/batch_coverage_5": 0.47908039689064025,
"rewards/brier_reward": 0.8297713875770569,
"rewards/confidence_uniqueness_reward": 0.7985679268836975,
"rewards/format_reward": 0.9936631917953491,
"rewards/frontier_aurc_reward": -0.001917445962317288,
"rewards/frontier_ece_reward": 0.026556022092700006,
"rewards/frontier_entropy_batch_reward": -0.5702940583229065,
"signal/accuracy_reward/centered_abs_mean": 0.16852213442325592,
"signal/accuracy_reward/group_std_mean": 0.21984477639198302,
"signal/accuracy_reward/group_zero_std_frac": 0.3833333432674408,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08426106721162796,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08426106721162796,
"signal/advantage_abs_mean": 0.15105001330375672,
"signal/advantage_pre_scale_abs_mean": 0.15105001330375672,
"signal/advantage_pre_scale_std": 0.2421530544757843,
"signal/advantage_std": 0.2421530544757843,
"signal/batch_coverage_0/centered_abs_mean": 0.1473347067832947,
"signal/batch_coverage_0/group_std_mean": 0.2022853225469589,
"signal/batch_coverage_0/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.014733470790088177,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.014733470790088177,
"signal/batch_coverage_1/centered_abs_mean": 0.1473347067832947,
"signal/batch_coverage_1/group_std_mean": 0.2022853225469589,
"signal/batch_coverage_1/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.014733470790088177,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.014733470790088177,
"signal/batch_coverage_10/centered_abs_mean": 0.17430627048015596,
"signal/batch_coverage_10/group_std_mean": 0.24450170993804932,
"signal/batch_coverage_10/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01743062734603882,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01743062734603882,
"signal/batch_coverage_15/centered_abs_mean": 0.18629721999168397,
"signal/batch_coverage_15/group_std_mean": 0.2586580038070679,
"signal/batch_coverage_15/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01862972229719162,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01862972229719162,
"signal/batch_coverage_20/centered_abs_mean": 0.19263581037521363,
"signal/batch_coverage_20/group_std_mean": 0.26611111760139466,
"signal/batch_coverage_20/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.019263581931591035,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.019263581931591035,
"signal/batch_coverage_25/centered_abs_mean": 0.19736669659614564,
"signal/batch_coverage_25/group_std_mean": 0.2709603726863861,
"signal/batch_coverage_25/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.019736670702695847,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.019736670702695847,
"signal/batch_coverage_5/centered_abs_mean": 0.15937765538692475,
"signal/batch_coverage_5/group_std_mean": 0.2213836669921875,
"signal/batch_coverage_5/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.015937766060233116,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.015937766060233116,
"signal/brier_reward/centered_abs_mean": 0.14500512927770615,
"signal/brier_reward/group_std_mean": 0.1952369064092636,
"signal/brier_reward/group_zero_std_frac": 0.002777777798473835,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.014500514045357705,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.014500514045357705,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.1249560296535492,
"signal/confidence_uniqueness_reward/group_std_mean": 0.15818934738636017,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.002777777798473835,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.012495603226125241,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.012495603226125241,
"signal/format_reward/centered_abs_mean": 0.01147460942156613,
"signal/format_reward/group_std_mean": 0.027114569582045077,
"signal/format_reward/group_zero_std_frac": 0.8666666626930237,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.005737304710783065,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.005737304710783065,
"signal/frontier_aurc_reward/centered_abs_mean": 0.002263281703926623,
"signal/frontier_aurc_reward/group_std_mean": 0.0033607793506234885,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.8291022317716852e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.8291022317716852e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.025421417877078058,
"signal/frontier_ece_reward/group_std_mean": 0.03576734662055969,
"signal/frontier_ece_reward/group_zero_std_frac": 0.00555555559694767,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.002542141871526837,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.002542141871526837,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3209464192390442,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.40138447284698486,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.06944444514811039,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.032094644755125044,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.032094644755125044,
"step": 95
},
{
"calibration/aurc": 0.14871540129140545,
"calibration/batch_distribution_entropy": 0.7226482654847846,
"calibration/buffer_distribution_entropy": 0.6533304306820955,
"calibration/confidence_entropy": 0.2790756192218355,
"calibration/coverage@0%": 0.022273176238182587,
"calibration/coverage@1%": 0.022273176238182587,
"calibration/coverage@10%": 0.384944260136185,
"calibration/coverage@15%": 0.550066165769163,
"calibration/coverage@20%": 0.7874869814759887,
"calibration/coverage@25%": 0.8813192236663167,
"calibration/coverage@30%": 0.9200326500742367,
"calibration/coverage@5%": 0.22022361653824696,
"calibration/ece": 0.10801010513597523,
"calibration/mean_confidence": 0.6640190515941942,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00902777777777779,
"completions/max_length": 3604.8,
"completions/max_terminated_length": 3604.8,
"completions/mean_length": 849.1434204101563,
"completions/mean_terminated_length": 856.893017578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 251.8,
"epoch": 0.23999700003749952,
"grad_norm": 0.0005327390390448272,
"learning_rate": 3.2530120481927713e-06,
"loss": -0.0083,
"num_tokens": 220676549.0,
"reward": 1.2861318826675414,
"reward_std": 0.2074308067560196,
"rewards/accuracy_reward": 0.6497395873069763,
"rewards/batch_coverage_0": 0.406613689661026,
"rewards/batch_coverage_1": 0.406613689661026,
"rewards/batch_coverage_10": 0.5238588929176331,
"rewards/batch_coverage_15": 0.5399280548095703,
"rewards/batch_coverage_20": 0.5521279692649841,
"rewards/batch_coverage_25": 0.5576201200485229,
"rewards/batch_coverage_5": 0.48190149664878845,
"rewards/brier_reward": 0.8477397084236145,
"rewards/confidence_uniqueness_reward": 0.8880740642547608,
"rewards/format_reward": 0.990711796283722,
"rewards/frontier_aurc_reward": -0.0011223536217585205,
"rewards/frontier_ece_reward": 0.0243696141988039,
"rewards/frontier_entropy_batch_reward": -0.5696449875831604,
"signal/accuracy_reward/centered_abs_mean": 0.17262912094593047,
"signal/accuracy_reward/group_std_mean": 0.22270674109458924,
"signal/accuracy_reward/group_zero_std_frac": 0.38611111640930174,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08631456047296523,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08631456047296523,
"signal/advantage_abs_mean": 0.14957348108291627,
"signal/advantage_pre_scale_abs_mean": 0.14957348108291627,
"signal/advantage_pre_scale_std": 0.23985751271247863,
"signal/advantage_std": 0.23985751271247863,
"signal/batch_coverage_0/centered_abs_mean": 0.14382635056972504,
"signal/batch_coverage_0/group_std_mean": 0.19585117995738982,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.014382634684443474,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.014382634684443474,
"signal/batch_coverage_1/centered_abs_mean": 0.14382635056972504,
"signal/batch_coverage_1/group_std_mean": 0.19585117995738982,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.014382634684443474,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.014382634684443474,
"signal/batch_coverage_10/centered_abs_mean": 0.1680520087480545,
"signal/batch_coverage_10/group_std_mean": 0.23160885274410248,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.016805201396346094,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.016805201396346094,
"signal/batch_coverage_15/centered_abs_mean": 0.17198814153671266,
"signal/batch_coverage_15/group_std_mean": 0.23695962727069855,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01719881482422352,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01719881482422352,
"signal/batch_coverage_20/centered_abs_mean": 0.17785200774669646,
"signal/batch_coverage_20/group_std_mean": 0.2448043555021286,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.017785200849175453,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.017785200849175453,
"signal/batch_coverage_25/centered_abs_mean": 0.18249513506889342,
"signal/batch_coverage_25/group_std_mean": 0.24947845041751862,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.01824951358139515,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.01824951358139515,
"signal/batch_coverage_5/centered_abs_mean": 0.15562867522239685,
"signal/batch_coverage_5/group_std_mean": 0.21346194446086883,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.015562868677079678,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.015562868677079678,
"signal/brier_reward/centered_abs_mean": 0.1374327450990677,
"signal/brier_reward/group_std_mean": 0.1854398012161255,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013743274100124835,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013743274100124835,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06414824798703193,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09137060940265655,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.006414824724197387,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006414824724197387,
"signal/format_reward/centered_abs_mean": 0.016845703311264514,
"signal/format_reward/group_std_mean": 0.0385689489543438,
"signal/format_reward/group_zero_std_frac": 0.8166666746139526,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.008422851655632257,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.008422851655632257,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0011418449808843433,
"signal/frontier_aurc_reward/group_std_mean": 0.0017946111736819148,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.4273062515712808e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.4273062515712808e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.025530267134308816,
"signal/frontier_ece_reward/group_std_mean": 0.037098944932222364,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.002553026657551527,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.002553026657551527,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3181028127670288,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3966158747673035,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.06944444645196199,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.031810281053185466,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.031810281053185466,
"step": 100
},
{
"epoch": 0.23999700003749952,
"eval_calibration/aurc": 0.15069789295964053,
"eval_calibration/batch_distribution_entropy": 0.6999939081758847,
"eval_calibration/buffer_distribution_entropy": 0.6590732287086882,
"eval_calibration/confidence_entropy": 0.2877143448048169,
"eval_calibration/coverage@0%": 0.2820900537634408,
"eval_calibration/coverage@1%": 0.2820900537634408,
"eval_calibration/coverage@10%": 0.3706317204301075,
"eval_calibration/coverage@15%": 0.5504928315412186,
"eval_calibration/coverage@20%": 0.7736447132616489,
"eval_calibration/coverage@25%": 0.8738015232974911,
"eval_calibration/coverage@30%": 0.9583333333333334,
"eval_calibration/coverage@5%": 0.3185483870967742,
"eval_calibration/ece": 0.17312003647113128,
"eval_calibration/mean_confidence": 0.6685240156767175,
"eval_completions/clipped_ratio": 0.00868055555555558,
"eval_completions/max_length": 2403.3333333333335,
"eval_completions/max_terminated_length": 2403.3333333333335,
"eval_completions/mean_length": 861.4699910481771,
"eval_completions/mean_terminated_length": 868.889882405599,
"eval_completions/min_length": 115.66666666666667,
"eval_completions/min_terminated_length": 322.8333333333333,
"eval_loss": 0.0,
"eval_num_tokens": 220676549.0,
"eval_reward": 1.0047101080417633,
"eval_reward_std": 0.29811317722002667,
"eval_rewards/accuracy_reward": 0.6388888955116272,
"eval_rewards/batch_coverage_0": 0.12254629905025165,
"eval_rewards/batch_coverage_1": 0.12254629905025165,
"eval_rewards/batch_coverage_10": 0.12550141910711923,
"eval_rewards/batch_coverage_15": 0.16085403288404146,
"eval_rewards/batch_coverage_20": 0.22410336136817932,
"eval_rewards/batch_coverage_25": 0.29980478684107464,
"eval_rewards/batch_coverage_5": 0.12447089081009229,
"eval_rewards/brier_reward": 0.8346987068653107,
"eval_rewards/confidence_uniqueness_reward": 0.8586996992429098,
"eval_rewards/format_reward": 0.9895833432674408,
"eval_rewards/frontier_aurc_reward": -0.001299240723407517,
"eval_rewards/frontier_ece_reward": 0.021259518650670845,
"eval_rewards/frontier_entropy_batch_reward": -0.9895833432674408,
"eval_runtime": 196.7066,
"eval_samples_per_second": 5.084,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4511718700329463,
"eval_signal/accuracy_reward/group_std_mean": 0.48224946359793347,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.22558593501647314,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.22558593501647314,
"eval_signal/advantage_abs_mean": 0.24452855934699377,
"eval_signal/advantage_pre_scale_abs_mean": 0.24452855934699377,
"eval_signal/advantage_pre_scale_std": 0.29801690578460693,
"eval_signal/advantage_std": 0.29801690578460693,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.2687097614010175,
"eval_signal/batch_coverage_0/group_std_mean": 0.372615580757459,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.026870975581308205,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.026870975581308205,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.2687097614010175,
"eval_signal/batch_coverage_1/group_std_mean": 0.372615580757459,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.026870975581308205,
"eval_signal/batch_coverage_1/weight": 0.10000000149011612,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.026870975581308205,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.2330656349658966,
"eval_signal/batch_coverage_10/group_std_mean": 0.32191962500413257,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.023306564427912235,
"eval_signal/batch_coverage_10/weight": 0.10000000149011612,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.023306564427912235,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.22498319546381632,
"eval_signal/batch_coverage_15/group_std_mean": 0.29725244144598645,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.022498319546381634,
"eval_signal/batch_coverage_15/weight": 0.10000000149011612,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.022498319546381634,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.273581658800443,
"eval_signal/batch_coverage_20/group_std_mean": 0.33339178065458935,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.027358165321250755,
"eval_signal/batch_coverage_20/weight": 0.10000000149011612,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.027358165321250755,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.35976608097553253,
"eval_signal/batch_coverage_25/group_std_mean": 0.4202146033445994,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.035976607662936054,
"eval_signal/batch_coverage_25/weight": 0.10000000149011612,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.035976607662936054,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.26241545875867206,
"eval_signal/batch_coverage_5/group_std_mean": 0.3641966183980306,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.026241544944544632,
"eval_signal/batch_coverage_5/weight": 0.10000000149011612,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.026241544944544632,
"eval_signal/brier_reward/centered_abs_mean": 0.2232651188969612,
"eval_signal/brier_reward/group_std_mean": 0.2941201577583949,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.02232651226222515,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.02232651226222515,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.07532106898725033,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.10399662268658479,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.007532107022901376,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007532107022901376,
"eval_signal/format_reward/centered_abs_mean": 0.019856771143774193,
"eval_signal/format_reward/group_std_mean": 0.049957338720560074,
"eval_signal/format_reward/group_zero_std_frac": 0.750000019868215,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.009928385571887096,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.009928385571887096,
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.0018068959082787235,
"eval_signal/frontier_aurc_reward/group_std_mean": 0.0034212637498664358,
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.2586199823611725e-05,
"eval_signal/frontier_aurc_reward/weight": 0.012500000186264515,
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.2586199823611725e-05,
"eval_signal/frontier_ece_reward/centered_abs_mean": 0.029543430544435978,
"eval_signal/frontier_ece_reward/group_std_mean": 0.0442526334275802,
"eval_signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0029543431010097265,
"eval_signal/frontier_ece_reward/weight": 0.10000000149011612,
"eval_signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0029543431010097265,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.019856771143774193,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.049957338720560074,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.750000019868215,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.001985677246314784,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.001985677246314784,
"eval_steps_per_second": 0.031,
"step": 100
},
{
"calibration/aurc": 0.27275814168966306,
"calibration/batch_distribution_entropy": 0.6649876038103061,
"calibration/buffer_distribution_entropy": 0.6616258775307035,
"calibration/confidence_entropy": 0.2404086982419873,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.18453443148021048,
"calibration/coverage@15%": 0.21867993594143215,
"calibration/coverage@20%": 0.42895934173473194,
"calibration/coverage@25%": 0.4990613706952415,
"calibration/coverage@30%": 0.5436143262171786,
"calibration/coverage@5%": 0.1546328071379547,
"calibration/ece": 0.16556052440644764,
"calibration/mean_confidence": 0.6860977484739768,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00972222222222221,
"completions/max_length": 3273.8,
"completions/max_terminated_length": 3273.8,
"completions/mean_length": 859.6019165039063,
"completions/mean_terminated_length": 868.0409423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 260.0,
"epoch": 0.2519968500393745,
"grad_norm": 0.0006035579717718065,
"learning_rate": 3.1024096385542172e-06,
"loss": -0.0095,
"num_tokens": 233656027.0,
"reward": 1.2749878406524657,
"reward_std": 0.20778575241565705,
"rewards/accuracy_reward": 0.6319444417953491,
"rewards/batch_coverage_0": 0.4026776671409607,
"rewards/batch_coverage_1": 0.4026776671409607,
"rewards/batch_coverage_10": 0.5386986613273621,
"rewards/batch_coverage_15": 0.5527087450027466,
"rewards/batch_coverage_20": 0.5678210139274598,
"rewards/batch_coverage_25": 0.5734583020210267,
"rewards/batch_coverage_5": 0.47398912310600283,
"rewards/brier_reward": 0.8429219245910644,
"rewards/confidence_uniqueness_reward": 0.8753986597061157,
"rewards/format_reward": 0.9901041626930237,
"rewards/frontier_aurc_reward": -0.0013104168232530356,
"rewards/frontier_ece_reward": 0.02432958744466305,
"rewards/frontier_entropy_batch_reward": -0.6148821473121643,
"signal/accuracy_reward/centered_abs_mean": 0.16476779580116271,
"signal/accuracy_reward/group_std_mean": 0.21900846362113952,
"signal/accuracy_reward/group_zero_std_frac": 0.38055555820465087,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08238389790058136,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08238389790058136,
"signal/advantage_abs_mean": 0.15073422193527222,
"signal/advantage_pre_scale_abs_mean": 0.15073422193527222,
"signal/advantage_pre_scale_std": 0.2431449383497238,
"signal/advantage_std": 0.2431449383497238,
"signal/batch_coverage_0/centered_abs_mean": 0.13495715856552123,
"signal/batch_coverage_0/group_std_mean": 0.18429154455661773,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.013495716080069543,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013495716080069543,
"signal/batch_coverage_1/centered_abs_mean": 0.13495715856552123,
"signal/batch_coverage_1/group_std_mean": 0.18429154455661773,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.013495716080069543,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.013495716080069543,
"signal/batch_coverage_10/centered_abs_mean": 0.1662070333957672,
"signal/batch_coverage_10/group_std_mean": 0.2270430624485016,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01662070322781801,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01662070322781801,
"signal/batch_coverage_15/centered_abs_mean": 0.17070157825946808,
"signal/batch_coverage_15/group_std_mean": 0.23345741331577302,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.017070158571004867,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.017070158571004867,
"signal/batch_coverage_20/centered_abs_mean": 0.18138117492198944,
"signal/batch_coverage_20/group_std_mean": 0.24707195162773132,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.018138118088245392,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.018138118088245392,
"signal/batch_coverage_25/centered_abs_mean": 0.18659622073173524,
"signal/batch_coverage_25/group_std_mean": 0.2526919931173325,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.018659623339772224,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.018659623339772224,
"signal/batch_coverage_5/centered_abs_mean": 0.1455353856086731,
"signal/batch_coverage_5/group_std_mean": 0.19852421581745147,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.014553537964820862,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.014553537964820862,
"signal/brier_reward/centered_abs_mean": 0.1374508783221245,
"signal/brier_reward/group_std_mean": 0.1849253445863724,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013745087757706642,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013745087757706642,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.0681469388306141,
"signal/confidence_uniqueness_reward/group_std_mean": 0.0911123812198639,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.006814693752676249,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006814693752676249,
"signal/format_reward/centered_abs_mean": 0.01706814244389534,
"signal/format_reward/group_std_mean": 0.033369756489992144,
"signal/format_reward/group_zero_std_frac": 0.8555555582046509,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00853407122194767,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00853407122194767,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0012912088888697327,
"signal/frontier_aurc_reward/group_std_mean": 0.002038556197658181,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.614011162018869e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.614011162018869e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.025292347371578216,
"signal/frontier_ece_reward/group_std_mean": 0.03649614751338959,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0025292347185313702,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0025292347185313702,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32601881623268125,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.4004940211772919,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.08055555745959282,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.03260188177227974,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03260188177227974,
"step": 105
},
{
"calibration/aurc": 0.13600304996634355,
"calibration/batch_distribution_entropy": 0.5387418942015992,
"calibration/buffer_distribution_entropy": 0.6603980686405777,
"calibration/confidence_entropy": 0.1700767895993173,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.06649214659685863,
"calibration/coverage@10%": 0.4305625432228937,
"calibration/coverage@15%": 0.6513986521999992,
"calibration/coverage@20%": 0.7441999017551388,
"calibration/coverage@25%": 0.8238441682474911,
"calibration/coverage@30%": 0.8981990649806256,
"calibration/coverage@5%": 0.23545211605584643,
"calibration/ece": 0.14491282588435714,
"calibration/mean_confidence": 0.7337704927321872,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007899305555555559,
"completions/max_length": 3590.2,
"completions/max_terminated_length": 3590.2,
"completions/mean_length": 890.5359497070312,
"completions/mean_terminated_length": 897.662841796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 279.6,
"epoch": 0.2639967000412495,
"grad_norm": 0.0005900281248614192,
"learning_rate": 2.9518072289156627e-06,
"loss": -0.0087,
"num_tokens": 247023449.0,
"reward": 1.3070582389831542,
"reward_std": 0.20293028652668,
"rewards/accuracy_reward": 0.6640624880790711,
"rewards/batch_coverage_0": 0.4146144509315491,
"rewards/batch_coverage_1": 0.4146144509315491,
"rewards/batch_coverage_10": 0.5749109148979187,
"rewards/batch_coverage_15": 0.5923280835151672,
"rewards/batch_coverage_20": 0.6081398606300354,
"rewards/batch_coverage_25": 0.6159737706184387,
"rewards/batch_coverage_5": 0.5078445851802826,
"rewards/brier_reward": 0.8550400853157043,
"rewards/confidence_uniqueness_reward": 0.8103942751884461,
"rewards/format_reward": 0.9920138835906982,
"rewards/frontier_aurc_reward": -0.0012431937037035822,
"rewards/frontier_ece_reward": 0.0255121573805809,
"rewards/frontier_entropy_batch_reward": -0.6290174722671509,
"signal/accuracy_reward/centered_abs_mean": 0.1595486134290695,
"signal/accuracy_reward/group_std_mean": 0.21389709711074828,
"signal/accuracy_reward/group_zero_std_frac": 0.375,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07977430671453475,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07977430671453475,
"signal/advantage_abs_mean": 0.1432841181755066,
"signal/advantage_pre_scale_abs_mean": 0.1432841181755066,
"signal/advantage_pre_scale_std": 0.23976631164550782,
"signal/advantage_std": 0.23976631164550782,
"signal/batch_coverage_0/centered_abs_mean": 0.1273946210741997,
"signal/batch_coverage_0/group_std_mean": 0.17782883048057557,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012739462591707707,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012739462591707707,
"signal/batch_coverage_1/centered_abs_mean": 0.1273946210741997,
"signal/batch_coverage_1/group_std_mean": 0.17782883048057557,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012739462591707707,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012739462591707707,
"signal/batch_coverage_10/centered_abs_mean": 0.1554081290960312,
"signal/batch_coverage_10/group_std_mean": 0.22439009249210357,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.015540812350809573,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.015540812350809573,
"signal/batch_coverage_15/centered_abs_mean": 0.16385475397109986,
"signal/batch_coverage_15/group_std_mean": 0.23465842604637147,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.016385475546121596,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.016385475546121596,
"signal/batch_coverage_20/centered_abs_mean": 0.1737336367368698,
"signal/batch_coverage_20/group_std_mean": 0.24739258289337157,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.017373364046216012,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.017373364046216012,
"signal/batch_coverage_25/centered_abs_mean": 0.18111605048179627,
"signal/batch_coverage_25/group_std_mean": 0.2562800019979477,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.01811160519719124,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.01811160519719124,
"signal/batch_coverage_5/centered_abs_mean": 0.13753320872783661,
"signal/batch_coverage_5/group_std_mean": 0.19644954800605774,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.013753321021795273,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.013753321021795273,
"signal/brier_reward/centered_abs_mean": 0.13349896520376206,
"signal/brier_reward/group_std_mean": 0.18303124904632567,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013349897041916848,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013349897041916848,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.09162542074918748,
"signal/confidence_uniqueness_reward/group_std_mean": 0.1198175922036171,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.009162542037665844,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.009162542037665844,
"signal/format_reward/centered_abs_mean": 0.01412760429084301,
"signal/format_reward/group_std_mean": 0.0280547920614481,
"signal/format_reward/group_zero_std_frac": 0.8805555582046509,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007063802145421505,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007063802145421505,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0014921376714482903,
"signal/frontier_aurc_reward/group_std_mean": 0.0024413310457021,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.865172198449727e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.865172198449727e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.021755291894078255,
"signal/frontier_ece_reward/group_std_mean": 0.03146950043737888,
"signal/frontier_ece_reward/group_zero_std_frac": 0.01111111119389534,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0021755292546004057,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0021755292546004057,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.29078406691551206,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.36565481424331664,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.14166666716337203,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.02907840646803379,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.02907840646803379,
"step": 110
},
{
"calibration/aurc": 0.24142886549835296,
"calibration/batch_distribution_entropy": 0.6974169468707274,
"calibration/buffer_distribution_entropy": 0.6602675731548552,
"calibration/confidence_entropy": 0.25641283231726597,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.0,
"calibration/coverage@10%": 0.1538266635798299,
"calibration/coverage@15%": 0.45309955988777517,
"calibration/coverage@20%": 0.5999502038930988,
"calibration/coverage@25%": 0.6443263312855432,
"calibration/coverage@30%": 0.6900952333916923,
"calibration/coverage@5%": 0.04128686327077748,
"calibration/ece": 0.16795997870097873,
"calibration/mean_confidence": 0.665753816688968,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008420138888888883,
"completions/max_length": 3502.0,
"completions/max_terminated_length": 3502.0,
"completions/mean_length": 925.295068359375,
"completions/mean_terminated_length": 933.1673828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 290.6,
"epoch": 0.27599655004312446,
"grad_norm": 0.0004857274179812521,
"learning_rate": 2.8012048192771087e-06,
"loss": -0.0094,
"num_tokens": 260762048.0,
"reward": 1.2888626337051392,
"reward_std": 0.1940772384405136,
"rewards/accuracy_reward": 0.63046875,
"rewards/batch_coverage_0": 0.42994843125343324,
"rewards/batch_coverage_1": 0.42994843125343324,
"rewards/batch_coverage_10": 0.5508076071739196,
"rewards/batch_coverage_15": 0.5608129739761353,
"rewards/batch_coverage_20": 0.5743337988853454,
"rewards/batch_coverage_25": 0.5797275543212891,
"rewards/batch_coverage_5": 0.5175644040107727,
"rewards/brier_reward": 0.8383445382118225,
"rewards/confidence_uniqueness_reward": 0.8627377271652221,
"rewards/format_reward": 0.9909722208976746,
"rewards/frontier_aurc_reward": -0.0014342849142849445,
"rewards/frontier_ece_reward": 0.021663056686520576,
"rewards/frontier_entropy_batch_reward": -0.5842878341674804,
"signal/accuracy_reward/centered_abs_mean": 0.15357530415058135,
"signal/accuracy_reward/group_std_mean": 0.20156333446502686,
"signal/accuracy_reward/group_zero_std_frac": 0.42222222685813904,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07678765207529067,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07678765207529067,
"signal/advantage_abs_mean": 0.13954857736825943,
"signal/advantage_pre_scale_abs_mean": 0.13954857736825943,
"signal/advantage_pre_scale_std": 0.2291879892349243,
"signal/advantage_std": 0.2291879892349243,
"signal/batch_coverage_0/centered_abs_mean": 0.13936588019132615,
"signal/batch_coverage_0/group_std_mean": 0.18808189630508423,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.013936588354408742,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013936588354408742,
"signal/batch_coverage_1/centered_abs_mean": 0.13936588019132615,
"signal/batch_coverage_1/group_std_mean": 0.18808189630508423,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.013936588354408742,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.013936588354408742,
"signal/batch_coverage_10/centered_abs_mean": 0.16270072162151336,
"signal/batch_coverage_10/group_std_mean": 0.2255598098039627,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.016270072013139725,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.016270072013139725,
"signal/batch_coverage_15/centered_abs_mean": 0.1643664598464966,
"signal/batch_coverage_15/group_std_mean": 0.228114452958107,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01643664576113224,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01643664576113224,
"signal/batch_coverage_20/centered_abs_mean": 0.1735037475824356,
"signal/batch_coverage_20/group_std_mean": 0.23918266594409943,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.017350374720990658,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.017350374720990658,
"signal/batch_coverage_25/centered_abs_mean": 0.17976273596286774,
"signal/batch_coverage_25/group_std_mean": 0.24600136280059814,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.017976274341344835,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.017976274341344835,
"signal/batch_coverage_5/centered_abs_mean": 0.15355750322341918,
"signal/batch_coverage_5/group_std_mean": 0.21199324131011962,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.015355750359594822,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.015355750359594822,
"signal/brier_reward/centered_abs_mean": 0.13386404365301133,
"signal/brier_reward/group_std_mean": 0.17856364250183104,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013386405259370803,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013386405259370803,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06989814937114716,
"signal/confidence_uniqueness_reward/group_std_mean": 0.0943081557750702,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.006989815179258585,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006989815179258585,
"signal/format_reward/centered_abs_mean": 0.015766059048473835,
"signal/format_reward/group_std_mean": 0.03223867490887642,
"signal/format_reward/group_zero_std_frac": 0.8583333373069764,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007883029524236917,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007883029524236917,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0012373300269246101,
"signal/frontier_aurc_reward/group_std_mean": 0.0018391921184957027,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.5466626064153387e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.5466626064153387e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.022429964691400527,
"signal/frontier_ece_reward/group_std_mean": 0.03295571245253086,
"signal/frontier_ece_reward/group_zero_std_frac": 0.002777777798473835,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.002242996543645859,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.002242996543645859,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3123690664768219,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3877885460853577,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0916666679084301,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.031236908584833144,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.031236908584833144,
"step": 115
},
{
"calibration/aurc": 0.23224894048684197,
"calibration/batch_distribution_entropy": 0.6577461754018263,
"calibration/buffer_distribution_entropy": 0.6644530368239384,
"calibration/confidence_entropy": 0.21956154023502678,
"calibration/coverage@0%": 0.030366492146596858,
"calibration/coverage@1%": 0.1,
"calibration/coverage@10%": 0.36432955586237065,
"calibration/coverage@15%": 0.4144833431301519,
"calibration/coverage@20%": 0.4844914084179733,
"calibration/coverage@25%": 0.5257761130780694,
"calibration/coverage@30%": 0.5780309761732261,
"calibration/coverage@5%": 0.2661244241521195,
"calibration/ece": 0.17240632915410462,
"calibration/mean_confidence": 0.6587200250674344,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0059895833333333485,
"completions/max_length": 3442.2,
"completions/max_terminated_length": 3442.2,
"completions/mean_length": 931.5639038085938,
"completions/mean_terminated_length": 937.10859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 316.8,
"epoch": 0.28799640004499943,
"grad_norm": 0.0004797873261850327,
"learning_rate": 2.6506024096385547e-06,
"loss": -0.0053,
"num_tokens": 274575520.0,
"reward": 1.3031680583953857,
"reward_std": 0.18711221516132354,
"rewards/accuracy_reward": 0.6480902671813965,
"rewards/batch_coverage_0": 0.4340886354446411,
"rewards/batch_coverage_1": 0.4340886354446411,
"rewards/batch_coverage_10": 0.5644845604896546,
"rewards/batch_coverage_15": 0.5737747311592102,
"rewards/batch_coverage_20": 0.5859205961227417,
"rewards/batch_coverage_25": 0.588034474849701,
"rewards/batch_coverage_5": 0.5137127637863159,
"rewards/brier_reward": 0.8417695760726929,
"rewards/confidence_uniqueness_reward": 0.8399509191513062,
"rewards/format_reward": 0.9940104246139526,
"rewards/frontier_aurc_reward": -0.001311807334423065,
"rewards/frontier_ece_reward": 0.021232662722468376,
"rewards/frontier_entropy_batch_reward": -0.575716894865036,
"signal/accuracy_reward/centered_abs_mean": 0.1477213516831398,
"signal/accuracy_reward/group_std_mean": 0.19991495907306672,
"signal/accuracy_reward/group_zero_std_frac": 0.4138888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0738606758415699,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.0738606758415699,
"signal/advantage_abs_mean": 0.13333643972873688,
"signal/advantage_pre_scale_abs_mean": 0.13333643972873688,
"signal/advantage_pre_scale_std": 0.22238788306713103,
"signal/advantage_std": 0.22238788306713103,
"signal/batch_coverage_0/centered_abs_mean": 0.1329650953412056,
"signal/batch_coverage_0/group_std_mean": 0.18248933255672456,
"signal/batch_coverage_0/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.013296510092914104,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013296510092914104,
"signal/batch_coverage_1/centered_abs_mean": 0.1329650953412056,
"signal/batch_coverage_1/group_std_mean": 0.18248933255672456,
"signal/batch_coverage_1/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.013296510092914104,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.013296510092914104,
"signal/batch_coverage_10/centered_abs_mean": 0.15834161937236785,
"signal/batch_coverage_10/group_std_mean": 0.22070887088775634,
"signal/batch_coverage_10/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.015834162198007106,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.015834162198007106,
"signal/batch_coverage_15/centered_abs_mean": 0.16037201285362243,
"signal/batch_coverage_15/group_std_mean": 0.22399567663669587,
"signal/batch_coverage_15/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.016037201695144176,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.016037201695144176,
"signal/batch_coverage_20/centered_abs_mean": 0.1704261153936386,
"signal/batch_coverage_20/group_std_mean": 0.2371358036994934,
"signal/batch_coverage_20/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.017042612284421922,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.017042612284421922,
"signal/batch_coverage_25/centered_abs_mean": 0.16791126430034636,
"signal/batch_coverage_25/group_std_mean": 0.23394818603992462,
"signal/batch_coverage_25/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.016791127249598505,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.016791127249598505,
"signal/batch_coverage_5/centered_abs_mean": 0.14462463855743407,
"signal/batch_coverage_5/group_std_mean": 0.20006984174251558,
"signal/batch_coverage_5/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.014462463743984699,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.014462463743984699,
"signal/brier_reward/centered_abs_mean": 0.12397899478673935,
"signal/brier_reward/group_std_mean": 0.17036497890949248,
"signal/brier_reward/group_zero_std_frac": 0.002777777798473835,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.012397900223731995,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012397900223731995,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07810356169939041,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10339468717575073,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.002777777798473835,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.007810356188565492,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007810356188565492,
"signal/format_reward/centered_abs_mean": 0.01047634556889534,
"signal/format_reward/group_std_mean": 0.02356223687529564,
"signal/format_reward/group_zero_std_frac": 0.8861111164093017,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00523817278444767,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00523817278444767,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0011394575121812522,
"signal/frontier_aurc_reward/group_std_mean": 0.0017364894039928914,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.4243219266063533e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.4243219266063533e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.020640724897384645,
"signal/frontier_ece_reward/group_std_mean": 0.030754799023270607,
"signal/frontier_ece_reward/group_zero_std_frac": 0.016666666977107523,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0020640727132558824,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0020640727132558824,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.31634058952331545,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3934920966625214,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.07222222313284873,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.03163406066596508,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03163406066596508,
"step": 120
},
{
"calibration/aurc": 0.13443913725346507,
"calibration/batch_distribution_entropy": 0.5604718237969802,
"calibration/buffer_distribution_entropy": 0.6658594200820985,
"calibration/confidence_entropy": 0.17788369083866884,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.10520833333333332,
"calibration/coverage@10%": 0.5163322368421053,
"calibration/coverage@15%": 0.5758991228070175,
"calibration/coverage@20%": 0.651672149122807,
"calibration/coverage@25%": 0.8831524122807017,
"calibration/coverage@30%": 0.9422916666666665,
"calibration/coverage@5%": 0.20729166666666665,
"calibration/ece": 0.11687316169530022,
"calibration/mean_confidence": 0.7484829133562686,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005381944444444442,
"completions/max_length": 3468.8,
"completions/max_terminated_length": 3468.8,
"completions/mean_length": 957.4337890625,
"completions/mean_terminated_length": 962.6183471679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 340.2,
"epoch": 0.2999962500468744,
"grad_norm": 0.0005288660177029669,
"learning_rate": 2.5e-06,
"loss": -0.0053,
"num_tokens": 288722821.0,
"reward": 1.3045886754989624,
"reward_std": 0.19153454899787903,
"rewards/accuracy_reward": 0.6677083253860474,
"rewards/batch_coverage_0": 0.4332961618900299,
"rewards/batch_coverage_1": 0.4332961618900299,
"rewards/batch_coverage_10": 0.566796088218689,
"rewards/batch_coverage_15": 0.5802444577217102,
"rewards/batch_coverage_20": 0.5940844774246216,
"rewards/batch_coverage_25": 0.5986591339111328,
"rewards/batch_coverage_5": 0.5141059577465057,
"rewards/brier_reward": 0.8566888213157654,
"rewards/confidence_uniqueness_reward": 0.7277114033699036,
"rewards/format_reward": 0.9946180582046509,
"rewards/frontier_aurc_reward": -0.0010761045676190406,
"rewards/frontier_ece_reward": 0.022585730999708176,
"rewards/frontier_entropy_batch_reward": -0.5930793166160584,
"signal/accuracy_reward/centered_abs_mean": 0.16424696147441864,
"signal/accuracy_reward/group_std_mean": 0.21489786207675934,
"signal/accuracy_reward/group_zero_std_frac": 0.39166666865348815,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08212348073720932,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08212348073720932,
"signal/advantage_abs_mean": 0.14050142765045165,
"signal/advantage_pre_scale_abs_mean": 0.14050142765045165,
"signal/advantage_pre_scale_std": 0.22893159091472626,
"signal/advantage_std": 0.22893159091472626,
"signal/batch_coverage_0/centered_abs_mean": 0.1296757608652115,
"signal/batch_coverage_0/group_std_mean": 0.1786007136106491,
"signal/batch_coverage_0/group_zero_std_frac": 0.027777778543531896,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012967575900256633,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012967575900256633,
"signal/batch_coverage_1/centered_abs_mean": 0.1296757608652115,
"signal/batch_coverage_1/group_std_mean": 0.1786007136106491,
"signal/batch_coverage_1/group_zero_std_frac": 0.027777778543531896,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012967575900256633,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012967575900256633,
"signal/batch_coverage_10/centered_abs_mean": 0.15659565031528472,
"signal/batch_coverage_10/group_std_mean": 0.2186544954776764,
"signal/batch_coverage_10/group_zero_std_frac": 0.025000000186264516,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01565956436097622,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01565956436097622,
"signal/batch_coverage_15/centered_abs_mean": 0.16400947272777558,
"signal/batch_coverage_15/group_std_mean": 0.22802729308605194,
"signal/batch_coverage_15/group_zero_std_frac": 0.025000000186264516,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.016400948539376257,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.016400948539376257,
"signal/batch_coverage_20/centered_abs_mean": 0.17170844078063965,
"signal/batch_coverage_20/group_std_mean": 0.2369672656059265,
"signal/batch_coverage_20/group_zero_std_frac": 0.025000000186264516,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.017170844972133635,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.017170844972133635,
"signal/batch_coverage_25/centered_abs_mean": 0.17332231402397155,
"signal/batch_coverage_25/group_std_mean": 0.23855508267879486,
"signal/batch_coverage_25/group_zero_std_frac": 0.025000000186264516,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.017332231998443602,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.017332231998443602,
"signal/batch_coverage_5/centered_abs_mean": 0.14173758327960967,
"signal/batch_coverage_5/group_std_mean": 0.19865359365940094,
"signal/batch_coverage_5/group_zero_std_frac": 0.027777778543531896,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.014173758402466773,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.014173758402466773,
"signal/brier_reward/centered_abs_mean": 0.13510052263736724,
"signal/brier_reward/group_std_mean": 0.17892874777317047,
"signal/brier_reward/group_zero_std_frac": 0.025000000186264516,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013510052487254143,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013510052487254143,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.12244658917188644,
"signal/confidence_uniqueness_reward/group_std_mean": 0.15771516263484955,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.025000000186264516,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.012244659289717675,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.012244659289717675,
"signal/format_reward/centered_abs_mean": 0.009570312406867742,
"signal/format_reward/group_std_mean": 0.019770674780011176,
"signal/format_reward/group_zero_std_frac": 0.9111111164093018,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.004785156203433871,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.004785156203433871,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0017101092729717494,
"signal/frontier_aurc_reward/group_std_mean": 0.0027070957468822597,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.137636729457881e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.137636729457881e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.019067096710205077,
"signal/frontier_ece_reward/group_std_mean": 0.028569764271378518,
"signal/frontier_ece_reward/group_zero_std_frac": 0.06388889066874981,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0019067097455263138,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0019067097455263138,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.29008581936359407,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3632409691810608,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.15277777910232543,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.02900858223438263,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.02900858223438263,
"step": 125
},
{
"calibration/aurc": 0.19127228185204476,
"calibration/batch_distribution_entropy": 0.5895829262380423,
"calibration/buffer_distribution_entropy": 0.6624590738418712,
"calibration/confidence_entropy": 0.19593788092214626,
"calibration/coverage@0%": 0.07783524408961186,
"calibration/coverage@1%": 0.13527649735331942,
"calibration/coverage@10%": 0.30391945128015657,
"calibration/coverage@15%": 0.39742277312595503,
"calibration/coverage@20%": 0.44843057846775736,
"calibration/coverage@25%": 0.7765386195340307,
"calibration/coverage@30%": 0.8501852970922469,
"calibration/coverage@5%": 0.21134445716307898,
"calibration/ece": 0.12887487159762867,
"calibration/mean_confidence": 0.6930903424604881,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00694444444444442,
"completions/max_length": 3430.0,
"completions/max_terminated_length": 3430.0,
"completions/mean_length": 1003.2494140625,
"completions/mean_terminated_length": 1010.2728271484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 293.6,
"epoch": 0.3119961000487494,
"grad_norm": 0.0005818709614686668,
"learning_rate": 2.349397590361446e-06,
"loss": -0.0075,
"num_tokens": 303405054.0,
"reward": 1.2743345260620118,
"reward_std": 0.2015679657459259,
"rewards/accuracy_reward": 0.6296007037162781,
"rewards/batch_coverage_0": 0.4014659643173218,
"rewards/batch_coverage_1": 0.4014659643173218,
"rewards/batch_coverage_10": 0.5354062855243683,
"rewards/batch_coverage_15": 0.5588637709617614,
"rewards/batch_coverage_20": 0.5805766224861145,
"rewards/batch_coverage_25": 0.5847961187362671,
"rewards/batch_coverage_5": 0.4904080927371979,
"rewards/brier_reward": 0.8410807013511657,
"rewards/confidence_uniqueness_reward": 0.8179298520088196,
"rewards/format_reward": 0.99296875,
"rewards/frontier_aurc_reward": -0.0014736841316334904,
"rewards/frontier_ece_reward": 0.01973484009504318,
"rewards/frontier_entropy_batch_reward": -0.6010460972785949,
"signal/accuracy_reward/centered_abs_mean": 0.17263997793197633,
"signal/accuracy_reward/group_std_mean": 0.22644833624362945,
"signal/accuracy_reward/group_zero_std_frac": 0.3638888895511627,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08631998896598816,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08631998896598816,
"signal/advantage_abs_mean": 0.14573590755462645,
"signal/advantage_pre_scale_abs_mean": 0.14573590755462645,
"signal/advantage_pre_scale_std": 0.23568201065063477,
"signal/advantage_std": 0.23568201065063477,
"signal/batch_coverage_0/centered_abs_mean": 0.13741378337144852,
"signal/batch_coverage_0/group_std_mean": 0.1913052588701248,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.013741378672420979,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013741378672420979,
"signal/batch_coverage_1/centered_abs_mean": 0.13741378337144852,
"signal/batch_coverage_1/group_std_mean": 0.1913052588701248,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.013741378672420979,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.013741378672420979,
"signal/batch_coverage_10/centered_abs_mean": 0.15665342509746552,
"signal/batch_coverage_10/group_std_mean": 0.2212431699037552,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.015665343031287193,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.015665343031287193,
"signal/batch_coverage_15/centered_abs_mean": 0.16616102159023285,
"signal/batch_coverage_15/group_std_mean": 0.23317878246307372,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.016616101562976836,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.016616101562976836,
"signal/batch_coverage_20/centered_abs_mean": 0.18078482151031494,
"signal/batch_coverage_20/group_std_mean": 0.25075055956840514,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.018078482896089553,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.018078482896089553,
"signal/batch_coverage_25/centered_abs_mean": 0.18434585630893707,
"signal/batch_coverage_25/group_std_mean": 0.254821240901947,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.018434586003422737,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.018434586003422737,
"signal/batch_coverage_5/centered_abs_mean": 0.1469416558742523,
"signal/batch_coverage_5/group_std_mean": 0.20604339838027955,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01469416581094265,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01469416581094265,
"signal/brier_reward/centered_abs_mean": 0.13546621352434157,
"signal/brier_reward/group_std_mean": 0.18270427882671356,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.013546621613204479,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.013546621613204479,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.09114307463169098,
"signal/confidence_uniqueness_reward/group_std_mean": 0.1203562155365944,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.009114306885749102,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.009114306885749102,
"signal/format_reward/centered_abs_mean": 0.013167317770421506,
"signal/format_reward/group_std_mean": 0.030222728475928308,
"signal/format_reward/group_zero_std_frac": 0.8583333373069764,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.006583658885210753,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006583658885210753,
"signal/frontier_aurc_reward/centered_abs_mean": 0.001271829183679074,
"signal/frontier_aurc_reward/group_std_mean": 0.0019367508590221404,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.5897865705483127e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.5897865705483127e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.019031322747468948,
"signal/frontier_ece_reward/group_std_mean": 0.029065588489174843,
"signal/frontier_ece_reward/group_zero_std_frac": 0.008333333395421505,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0019031323958188296,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0019031323958188296,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.28876765370368956,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3632201969623566,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.13888888955116271,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.028876765072345732,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.028876765072345732,
"step": 130
},
{
"calibration/aurc": 0.18872730403697074,
"calibration/batch_distribution_entropy": 0.6812975794088347,
"calibration/buffer_distribution_entropy": 0.6630834998201833,
"calibration/confidence_entropy": 0.2641302845207065,
"calibration/coverage@0%": 0.16066601049868767,
"calibration/coverage@1%": 0.186186843832021,
"calibration/coverage@10%": 0.3252712736565824,
"calibration/coverage@15%": 0.37397193155131925,
"calibration/coverage@20%": 0.5700034901472911,
"calibration/coverage@25%": 0.7193106086140599,
"calibration/coverage@30%": 0.7926389789838693,
"calibration/coverage@5%": 0.21066601049868766,
"calibration/ece": 0.1480757767000653,
"calibration/mean_confidence": 0.7052663474885803,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00815972222222221,
"completions/max_length": 3630.4,
"completions/max_terminated_length": 3630.4,
"completions/mean_length": 958.3525268554688,
"completions/mean_terminated_length": 966.2191772460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 306.4,
"epoch": 0.32399595005062437,
"grad_norm": 0.0005636845016852021,
"learning_rate": 2.1987951807228917e-06,
"loss": -0.0079,
"num_tokens": 317538299.0,
"reward": 1.2903289556503297,
"reward_std": 0.200746613740921,
"rewards/accuracy_reward": 0.635937511920929,
"rewards/batch_coverage_0": 0.4029585897922516,
"rewards/batch_coverage_1": 0.4029585897922516,
"rewards/batch_coverage_10": 0.5487715125083923,
"rewards/batch_coverage_15": 0.560142207145691,
"rewards/batch_coverage_20": 0.5713020324707031,
"rewards/batch_coverage_25": 0.5746458768844604,
"rewards/batch_coverage_5": 0.4942119956016541,
"rewards/brier_reward": 0.8597426056861878,
"rewards/confidence_uniqueness_reward": 0.8913098454475403,
"rewards/format_reward": 0.9918402791023254,
"rewards/frontier_aurc_reward": -0.0009640321717597544,
"rewards/frontier_ece_reward": 0.01977999582886696,
"rewards/frontier_entropy_batch_reward": -0.5613025665283203,
"signal/accuracy_reward/centered_abs_mean": 0.15625000298023223,
"signal/accuracy_reward/group_std_mean": 0.20937940776348113,
"signal/accuracy_reward/group_zero_std_frac": 0.38333333730697633,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07812500149011611,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07812500149011611,
"signal/advantage_abs_mean": 0.14403506219387055,
"signal/advantage_pre_scale_abs_mean": 0.14403506219387055,
"signal/advantage_pre_scale_std": 0.23285807073116302,
"signal/advantage_std": 0.23285807073116302,
"signal/batch_coverage_0/centered_abs_mean": 0.13533948957920075,
"signal/batch_coverage_0/group_std_mean": 0.18287818133831024,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.013533948920667171,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013533948920667171,
"signal/batch_coverage_1/centered_abs_mean": 0.13533948957920075,
"signal/batch_coverage_1/group_std_mean": 0.18287818133831024,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.013533948920667171,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.013533948920667171,
"signal/batch_coverage_10/centered_abs_mean": 0.16238720417022706,
"signal/batch_coverage_10/group_std_mean": 0.22439574897289277,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.0162387203425169,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.0162387203425169,
"signal/batch_coverage_15/centered_abs_mean": 0.16828042268753052,
"signal/batch_coverage_15/group_std_mean": 0.23210206627845764,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.016828041709959506,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.016828041709959506,
"signal/batch_coverage_20/centered_abs_mean": 0.17479702234268188,
"signal/batch_coverage_20/group_std_mean": 0.24096376001834868,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.017479703202843665,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.017479703202843665,
"signal/batch_coverage_25/centered_abs_mean": 0.17699779570102692,
"signal/batch_coverage_25/group_std_mean": 0.24334222674369813,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.01769977994263172,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.01769977994263172,
"signal/batch_coverage_5/centered_abs_mean": 0.14455084204673768,
"signal/batch_coverage_5/group_std_mean": 0.19790201783180236,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01445508487522602,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01445508487522602,
"signal/brier_reward/centered_abs_mean": 0.12116786390542984,
"signal/brier_reward/group_std_mean": 0.16832829415798187,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01211678646504879,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01211678646504879,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.05779082030057907,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07999642044305802,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.005779081955552101,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005779081955552101,
"signal/format_reward/centered_abs_mean": 0.014279513992369175,
"signal/format_reward/group_std_mean": 0.029366502165794374,
"signal/format_reward/group_zero_std_frac": 0.8694444537162781,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007139756996184588,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007139756996184588,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0006708215922117233,
"signal/frontier_aurc_reward/group_std_mean": 0.0010480164433829486,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 8.385269848076859e-06,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 8.385269848076859e-06,
"signal/frontier_ece_reward/centered_abs_mean": 0.021684225276112558,
"signal/frontier_ece_reward/group_std_mean": 0.03304144144058228,
"signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.00216842251829803,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.00216842251829803,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.311614853143692,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3836499214172363,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0861111119389534,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.031161487475037573,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.031161487475037573,
"step": 135
},
{
"calibration/aurc": 0.13130987388401666,
"calibration/batch_distribution_entropy": 0.6597482867683313,
"calibration/buffer_distribution_entropy": 0.6702807202345886,
"calibration/confidence_entropy": 0.25772347328549483,
"calibration/coverage@0%": 0.07819131853785902,
"calibration/coverage@1%": 0.18912465187119235,
"calibration/coverage@10%": 0.46348298520452574,
"calibration/coverage@15%": 0.5631871518711924,
"calibration/coverage@20%": 0.738234268929504,
"calibration/coverage@25%": 0.8244881418624891,
"calibration/coverage@30%": 0.8930138054830288,
"calibration/coverage@5%": 0.366953818537859,
"calibration/ece": 0.07870854715898996,
"calibration/mean_confidence": 0.6800026414696745,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0073784722222222324,
"completions/max_length": 3265.2,
"completions/max_terminated_length": 3265.2,
"completions/mean_length": 938.0556518554688,
"completions/mean_terminated_length": 945.0173217773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 318.8,
"epoch": 0.33599580005249935,
"grad_norm": 0.0005809567519463599,
"learning_rate": 2.0481927710843377e-06,
"loss": -0.0076,
"num_tokens": 331448924.0,
"reward": 1.2944050073623656,
"reward_std": 0.19232451021671296,
"rewards/accuracy_reward": 0.6342881917953491,
"rewards/batch_coverage_0": 0.42060819268226624,
"rewards/batch_coverage_1": 0.42060819268226624,
"rewards/batch_coverage_10": 0.5566146194934845,
"rewards/batch_coverage_15": 0.5694395661354065,
"rewards/batch_coverage_20": 0.578026282787323,
"rewards/batch_coverage_25": 0.5797327399253845,
"rewards/batch_coverage_5": 0.5075171232223511,
"rewards/brier_reward": 0.8636380910873414,
"rewards/confidence_uniqueness_reward": 0.8904500603675842,
"rewards/format_reward": 0.9926215291023255,
"rewards/frontier_aurc_reward": -0.000849519798066467,
"rewards/frontier_ece_reward": 0.018118308484554292,
"rewards/frontier_entropy_batch_reward": -0.5951453566551208,
"signal/accuracy_reward/centered_abs_mean": 0.1548122853040695,
"signal/accuracy_reward/group_std_mean": 0.2058233290910721,
"signal/accuracy_reward/group_zero_std_frac": 0.4083333432674408,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07740614265203476,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07740614265203476,
"signal/advantage_abs_mean": 0.1391018182039261,
"signal/advantage_pre_scale_abs_mean": 0.1391018182039261,
"signal/advantage_pre_scale_std": 0.22803472578525544,
"signal/advantage_std": 0.22803472578525544,
"signal/batch_coverage_0/centered_abs_mean": 0.13350388407707214,
"signal/batch_coverage_0/group_std_mean": 0.18149447143077851,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.013350388407707215,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.013350388407707215,
"signal/batch_coverage_1/centered_abs_mean": 0.13350388407707214,
"signal/batch_coverage_1/group_std_mean": 0.18149447143077851,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.013350388407707215,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.013350388407707215,
"signal/batch_coverage_10/centered_abs_mean": 0.16032011508941652,
"signal/batch_coverage_10/group_std_mean": 0.2208678036928177,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.016032011434435844,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.016032011434435844,
"signal/batch_coverage_15/centered_abs_mean": 0.16713197529315948,
"signal/batch_coverage_15/group_std_mean": 0.22929813265800475,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.016713198088109494,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.016713198088109494,
"signal/batch_coverage_20/centered_abs_mean": 0.17368297278881073,
"signal/batch_coverage_20/group_std_mean": 0.2377300262451172,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.017368298023939133,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.017368298023939133,
"signal/batch_coverage_25/centered_abs_mean": 0.17488843202590942,
"signal/batch_coverage_25/group_std_mean": 0.23914681673049926,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.017488843947649,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.017488843947649,
"signal/batch_coverage_5/centered_abs_mean": 0.14723534882068634,
"signal/batch_coverage_5/group_std_mean": 0.20246534347534179,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01472353506833315,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01472353506833315,
"signal/brier_reward/centered_abs_mean": 0.12155497521162033,
"signal/brier_reward/group_std_mean": 0.16532252728939056,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.012155497632920741,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.012155497632920741,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.05778259187936783,
"signal/confidence_uniqueness_reward/group_std_mean": 0.07849341556429863,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0057782595045864586,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0057782595045864586,
"signal/format_reward/centered_abs_mean": 0.012679036427289247,
"signal/format_reward/group_std_mean": 0.026040823385119437,
"signal/format_reward/group_zero_std_frac": 0.8833333492279053,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.006339518213644623,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006339518213644623,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0005655112327076495,
"signal/frontier_aurc_reward/group_std_mean": 0.000877034547738731,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 7.0688900450477375e-06,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 7.0688900450477375e-06,
"signal/frontier_ece_reward/centered_abs_mean": 0.02037911266088486,
"signal/frontier_ece_reward/group_std_mean": 0.030708856508135797,
"signal/frontier_ece_reward/group_zero_std_frac": 0.002777777798473835,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0020379112334921955,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0020379112334921955,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.30631880164146424,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.38140068054199217,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.09444444552063942,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.030631881207227707,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.030631881207227707,
"step": 140
},
{
"calibration/aurc": 0.13242803980464785,
"calibration/batch_distribution_entropy": 0.6897934814373954,
"calibration/buffer_distribution_entropy": 0.6836537797572135,
"calibration/confidence_entropy": 0.2472499154735496,
"calibration/coverage@0%": 0.06841974896222075,
"calibration/coverage@1%": 0.08460773851835651,
"calibration/coverage@10%": 0.5285086137957431,
"calibration/coverage@15%": 0.6229715886566511,
"calibration/coverage@20%": 0.7273929512460187,
"calibration/coverage@25%": 0.7921071247932414,
"calibration/coverage@30%": 0.8703833987783595,
"calibration/coverage@5%": 0.38859692328635415,
"calibration/ece": 0.1295649301313624,
"calibration/mean_confidence": 0.6769795923957921,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006423611111111116,
"completions/max_length": 3228.4,
"completions/max_terminated_length": 3228.4,
"completions/mean_length": 909.677001953125,
"completions/mean_terminated_length": 915.5734008789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 291.8,
"epoch": 0.34799565005437433,
"grad_norm": 0.0005263423081487417,
"learning_rate": 1.8975903614457832e-06,
"loss": -0.0073,
"num_tokens": 344993011.0,
"reward": 1.3420505285263062,
"reward_std": 0.17107113003730773,
"rewards/accuracy_reward": 0.6890625,
"rewards/batch_coverage_0": 0.4715816080570221,
"rewards/batch_coverage_1": 0.4715816080570221,
"rewards/batch_coverage_10": 0.5932251930236816,
"rewards/batch_coverage_15": 0.597273302078247,
"rewards/batch_coverage_20": 0.6058772444725037,
"rewards/batch_coverage_25": 0.6105739712715149,
"rewards/batch_coverage_5": 0.5536993384361267,
"rewards/brier_reward": 0.8571294665336608,
"rewards/confidence_uniqueness_reward": 0.840834093093872,
"rewards/format_reward": 0.9935763716697693,
"rewards/frontier_aurc_reward": -0.000966754974797368,
"rewards/frontier_ece_reward": 0.016152642853558063,
"rewards/frontier_entropy_batch_reward": -0.6104970335960388,
"signal/accuracy_reward/centered_abs_mean": 0.14091796725988387,
"signal/accuracy_reward/group_std_mean": 0.18586435317993164,
"signal/accuracy_reward/group_zero_std_frac": 0.4694444477558136,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07045898362994193,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07045898362994193,
"signal/advantage_abs_mean": 0.1214504137635231,
"signal/advantage_pre_scale_abs_mean": 0.1214504137635231,
"signal/advantage_pre_scale_std": 0.21154877543449402,
"signal/advantage_std": 0.21154877543449402,
"signal/batch_coverage_0/centered_abs_mean": 0.12109262049198151,
"signal/batch_coverage_0/group_std_mean": 0.16618651747703553,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012109261937439442,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012109261937439442,
"signal/batch_coverage_1/centered_abs_mean": 0.12109262049198151,
"signal/batch_coverage_1/group_std_mean": 0.16618651747703553,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012109261937439442,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012109261937439442,
"signal/batch_coverage_10/centered_abs_mean": 0.14189857691526414,
"signal/batch_coverage_10/group_std_mean": 0.20135764181613922,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.014189857989549637,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.014189857989549637,
"signal/batch_coverage_15/centered_abs_mean": 0.1426687866449356,
"signal/batch_coverage_15/group_std_mean": 0.2024532824754715,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.014266878738999367,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.014266878738999367,
"signal/batch_coverage_20/centered_abs_mean": 0.14260076880455017,
"signal/batch_coverage_20/group_std_mean": 0.20335802137851716,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.01426007729023695,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.01426007729023695,
"signal/batch_coverage_25/centered_abs_mean": 0.1483145087957382,
"signal/batch_coverage_25/group_std_mean": 0.2090164303779602,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.014831452071666718,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.014831452071666718,
"signal/batch_coverage_5/centered_abs_mean": 0.13110972493886947,
"signal/batch_coverage_5/group_std_mean": 0.1835319548845291,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01311097275465727,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01311097275465727,
"signal/brier_reward/centered_abs_mean": 0.11349970698356629,
"signal/brier_reward/group_std_mean": 0.15323745906352998,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011349971219897271,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011349971219897271,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08850041627883912,
"signal/confidence_uniqueness_reward/group_std_mean": 0.11195365190505982,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008850041963160037,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008850041963160037,
"signal/format_reward/centered_abs_mean": 0.011295572854578495,
"signal/format_reward/group_std_mean": 0.023191133886575697,
"signal/format_reward/group_zero_std_frac": 0.8972222328186035,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0056477864272892475,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0056477864272892475,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0007335938105825334,
"signal/frontier_aurc_reward/group_std_mean": 0.0011282574851065874,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 9.169922486762517e-06,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 9.169922486762517e-06,
"signal/frontier_ece_reward/centered_abs_mean": 0.015803563967347146,
"signal/frontier_ece_reward/group_std_mean": 0.02349414937198162,
"signal/frontier_ece_reward/group_zero_std_frac": 0.002777777798473835,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0015803564339876175,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0015803564339876175,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.263643753528595,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.33980435132980347,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.1500000014901161,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.026364374533295633,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.026364374533295633,
"step": 145
},
{
"calibration/aurc": 0.1312147602821559,
"calibration/batch_distribution_entropy": 0.661729676250131,
"calibration/buffer_distribution_entropy": 0.6944097591767437,
"calibration/confidence_entropy": 0.23713249978777262,
"calibration/coverage@0%": 0.04393264545314922,
"calibration/coverage@1%": 0.16382511857142878,
"calibration/coverage@10%": 0.47878265415336674,
"calibration/coverage@15%": 0.674541930220249,
"calibration/coverage@20%": 0.7587017934939817,
"calibration/coverage@25%": 0.816094054136481,
"calibration/coverage@30%": 0.891664031966991,
"calibration/coverage@5%": 0.22227004964753982,
"calibration/ece": 0.11323309080521082,
"calibration/mean_confidence": 0.7025271439706133,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006423611111111116,
"completions/max_length": 3744.0,
"completions/max_terminated_length": 3744.0,
"completions/mean_length": 1006.754345703125,
"completions/mean_terminated_length": 1013.2921508789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 297.4,
"epoch": 0.3599955000562493,
"grad_norm": 0.0005483371787704527,
"learning_rate": 1.7469879518072292e-06,
"loss": -0.0063,
"num_tokens": 359701157.0,
"reward": 1.3184950351715088,
"reward_std": 0.18533724546432495,
"rewards/accuracy_reward": 0.6743923664093018,
"rewards/batch_coverage_0": 0.4469079613685608,
"rewards/batch_coverage_1": 0.4469079613685608,
"rewards/batch_coverage_10": 0.5693666577339173,
"rewards/batch_coverage_15": 0.5873542547225952,
"rewards/batch_coverage_20": 0.602312695980072,
"rewards/batch_coverage_25": 0.6088623762130737,
"rewards/batch_coverage_5": 0.5222943484783172,
"rewards/brier_reward": 0.852564013004303,
"rewards/confidence_uniqueness_reward": 0.8201781153678894,
"rewards/format_reward": 0.9934027791023254,
"rewards/frontier_aurc_reward": -0.0010987881105393171,
"rewards/frontier_ece_reward": 0.013082112185657024,
"rewards/frontier_entropy_batch_reward": -0.6237188339233398,
"signal/accuracy_reward/centered_abs_mean": 0.16347113847732545,
"signal/accuracy_reward/group_std_mean": 0.21139195263385774,
"signal/accuracy_reward/group_zero_std_frac": 0.41666666865348817,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08173556923866272,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.08173556923866272,
"signal/advantage_abs_mean": 0.13506793975830078,
"signal/advantage_pre_scale_abs_mean": 0.13506793975830078,
"signal/advantage_pre_scale_std": 0.22540478706359862,
"signal/advantage_std": 0.22540478706359862,
"signal/batch_coverage_0/centered_abs_mean": 0.12697066515684127,
"signal/batch_coverage_0/group_std_mean": 0.17325193881988527,
"signal/batch_coverage_0/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012697066552937031,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012697066552937031,
"signal/batch_coverage_1/centered_abs_mean": 0.12697066515684127,
"signal/batch_coverage_1/group_std_mean": 0.17325193881988527,
"signal/batch_coverage_1/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012697066552937031,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012697066552937031,
"signal/batch_coverage_10/centered_abs_mean": 0.148236745595932,
"signal/batch_coverage_10/group_std_mean": 0.2072461098432541,
"signal/batch_coverage_10/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.014823674410581588,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.014823674410581588,
"signal/batch_coverage_15/centered_abs_mean": 0.15683491826057433,
"signal/batch_coverage_15/group_std_mean": 0.21849296391010284,
"signal/batch_coverage_15/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.015683492459356786,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.015683492459356786,
"signal/batch_coverage_20/centered_abs_mean": 0.16781437695026397,
"signal/batch_coverage_20/group_std_mean": 0.2322419822216034,
"signal/batch_coverage_20/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.016781437583267687,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.016781437583267687,
"signal/batch_coverage_25/centered_abs_mean": 0.17468718588352203,
"signal/batch_coverage_25/group_std_mean": 0.23929275274276735,
"signal/batch_coverage_25/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.017468718439340593,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.017468718439340593,
"signal/batch_coverage_5/centered_abs_mean": 0.1365981876850128,
"signal/batch_coverage_5/group_std_mean": 0.19019657075405122,
"signal/batch_coverage_5/group_zero_std_frac": 0.00555555559694767,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.013659819029271603,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.013659819029271603,
"signal/brier_reward/centered_abs_mean": 0.12594723403453828,
"signal/brier_reward/group_std_mean": 0.16902333796024321,
"signal/brier_reward/group_zero_std_frac": 0.00555555559694767,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01259472370147705,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01259472370147705,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08781559914350509,
"signal/confidence_uniqueness_reward/group_std_mean": 0.11438610553741455,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.00555555559694767,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.00878155967220664,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.00878155967220664,
"signal/format_reward/centered_abs_mean": 0.011816406156867743,
"signal/format_reward/group_std_mean": 0.026017525047063828,
"signal/format_reward/group_zero_std_frac": 0.8777777791023255,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.005908203078433871,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.005908203078433871,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0011653327848762274,
"signal/frontier_aurc_reward/group_std_mean": 0.0018219074700027704,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.4566658683179412e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.4566658683179412e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.011827461235225201,
"signal/frontier_ece_reward/group_std_mean": 0.017636168748140335,
"signal/frontier_ece_reward/group_zero_std_frac": 0.011111111380159855,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0011827461421489716,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0011827461421489716,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.266205421090126,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.34265450239181516,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.1638888895511627,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.026620543748140334,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.026620543748140334,
"step": 150
},
{
"epoch": 0.3599955000562493,
"eval_calibration/aurc": 0.11871445559673784,
"eval_calibration/batch_distribution_entropy": 0.579269640153368,
"eval_calibration/buffer_distribution_entropy": 0.6991974471728701,
"eval_calibration/confidence_entropy": 0.20986329461099262,
"eval_calibration/coverage@0%": 0.25235215053763443,
"eval_calibration/coverage@1%": 0.25235215053763443,
"eval_calibration/coverage@10%": 0.597614247311828,
"eval_calibration/coverage@15%": 0.6656586021505376,
"eval_calibration/coverage@20%": 0.8015793010752689,
"eval_calibration/coverage@25%": 0.8592069892473119,
"eval_calibration/coverage@30%": 0.9583333333333334,
"eval_calibration/coverage@5%": 0.29922715053763443,
"eval_calibration/ece": 0.1486032851244478,
"eval_calibration/mean_confidence": 0.693362338484394,
"eval_completions/clipped_ratio": 0.006076388888888895,
"eval_completions/max_length": 2499.5,
"eval_completions/max_terminated_length": 2499.5,
"eval_completions/mean_length": 950.9599100748698,
"eval_completions/mean_terminated_length": 956.7284545898438,
"eval_completions/min_length": 136.16666666666666,
"eval_completions/min_terminated_length": 369.0,
"eval_loss": 0.0,
"eval_num_tokens": 359701157.0,
"eval_reward": 1.0300345222155254,
"eval_reward_std": 0.2921375830968221,
"eval_rewards/accuracy_reward": 0.6527777711550394,
"eval_rewards/batch_coverage_0": 0.13550970455010733,
"eval_rewards/batch_coverage_1": 0.13550970455010733,
"eval_rewards/batch_coverage_10": 0.17404532556732497,
"eval_rewards/batch_coverage_15": 0.21777691567937532,
"eval_rewards/batch_coverage_20": 0.2769256259004275,
"eval_rewards/batch_coverage_25": 0.338382991651694,
"eval_rewards/batch_coverage_5": 0.13637679815292358,
"eval_rewards/brier_reward": 0.8511766990025839,
"eval_rewards/confidence_uniqueness_reward": 0.7835536301136017,
"eval_rewards/format_reward": 0.9939236144224802,
"eval_rewards/frontier_aurc_reward": -0.0016888692043721676,
"eval_rewards/frontier_ece_reward": 0.011714956567933163,
"eval_rewards/frontier_entropy_batch_reward": -0.9939236144224802,
"eval_runtime": 179.13,
"eval_samples_per_second": 5.583,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4405381977558136,
"eval_signal/accuracy_reward/group_std_mean": 0.4760441432396571,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.2202690988779068,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2202690988779068,
"eval_signal/advantage_abs_mean": 0.23840626577536264,
"eval_signal/advantage_pre_scale_abs_mean": 0.23840626577536264,
"eval_signal/advantage_pre_scale_std": 0.29046809176603955,
"eval_signal/advantage_std": 0.29046809176603955,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.28725582361221313,
"eval_signal/batch_coverage_0/group_std_mean": 0.39222339789072674,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.028725583106279373,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.028725583106279373,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.28725582361221313,
"eval_signal/batch_coverage_1/group_std_mean": 0.39222339789072674,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.028725583106279373,
"eval_signal/batch_coverage_1/weight": 0.10000000149011612,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.028725583106279373,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.23628024011850357,
"eval_signal/batch_coverage_10/group_std_mean": 0.30995993067820865,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.02362802407393853,
"eval_signal/batch_coverage_10/weight": 0.10000000149011612,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.02362802407393853,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.249565156797568,
"eval_signal/batch_coverage_15/group_std_mean": 0.32002828270196915,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.024956516921520233,
"eval_signal/batch_coverage_15/weight": 0.10000000149011612,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.024956516921520233,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.2884892448782921,
"eval_signal/batch_coverage_20/group_std_mean": 0.3490825891494751,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.028848926226298015,
"eval_signal/batch_coverage_20/weight": 0.10000000149011612,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.028848926226298015,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.36286551256974536,
"eval_signal/batch_coverage_25/group_std_mean": 0.42505596081415814,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.036286553367972374,
"eval_signal/batch_coverage_25/weight": 0.10000000149011612,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.036286553367972374,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.27063219497601193,
"eval_signal/batch_coverage_5/group_std_mean": 0.36894193291664124,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.027063219187160332,
"eval_signal/batch_coverage_5/weight": 0.10000000149011612,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.027063219187160332,
"eval_signal/brier_reward/centered_abs_mean": 0.21436868607997894,
"eval_signal/brier_reward/group_std_mean": 0.2930321345726649,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.021436869477232296,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.021436869477232296,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.14700014889240265,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.17157389223575592,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.014700015230725208,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.014700015230725208,
"eval_signal/format_reward/centered_abs_mean": 0.011664496424297491,
"eval_signal/format_reward/group_std_mean": 0.031383837262789406,
"eval_signal/format_reward/group_zero_std_frac": 0.8333333532015482,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0058322482121487456,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.0058322482121487456,
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.002837341142973552,
"eval_signal/frontier_aurc_reward/group_std_mean": 0.006045101191072415,
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 3.5466764378118874e-05,
"eval_signal/frontier_aurc_reward/weight": 0.012500000186264515,
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 3.5466764378118874e-05,
"eval_signal/frontier_ece_reward/centered_abs_mean": 0.013863155307869116,
"eval_signal/frontier_ece_reward/group_std_mean": 0.020291317875186603,
"eval_signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.001386315542428444,
"eval_signal/frontier_ece_reward/weight": 0.10000000149011612,
"eval_signal/frontier_ece_reward/weighted_centered_abs_mean": 0.001386315542428444,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.011664496424297491,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.031383837262789406,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.8333333532015482,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.0011664496657128136,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0011664496657128136,
"eval_steps_per_second": 0.033,
"step": 150
},
{
"calibration/aurc": 0.12383752382904105,
"calibration/batch_distribution_entropy": 0.702563636124068,
"calibration/buffer_distribution_entropy": 0.6998238675351348,
"calibration/confidence_entropy": 0.2524229859327377,
"calibration/coverage@0%": 0.05863069335654192,
"calibration/coverage@1%": 0.1390484479257325,
"calibration/coverage@10%": 0.5521362222291111,
"calibration/coverage@15%": 0.7136257636058139,
"calibration/coverage@20%": 0.793529087825562,
"calibration/coverage@25%": 0.8577884487501664,
"calibration/coverage@30%": 0.9075186022581707,
"calibration/coverage@5%": 0.3468908052343468,
"calibration/ece": 0.14021003480180688,
"calibration/mean_confidence": 0.7065864211069093,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005989583333333326,
"completions/max_length": 3560.4,
"completions/max_terminated_length": 3560.4,
"completions/mean_length": 945.785693359375,
"completions/mean_terminated_length": 951.551171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 310.2,
"epoch": 0.3719953500581243,
"grad_norm": 0.0005548963672481477,
"learning_rate": 1.5963855421686747e-06,
"loss": -0.0039,
"num_tokens": 373704320.0,
"reward": 1.360306739807129,
"reward_std": 0.17673680186271667,
"rewards/accuracy_reward": 0.71875,
"rewards/batch_coverage_0": 0.46235992312431334,
"rewards/batch_coverage_1": 0.46235992312431334,
"rewards/batch_coverage_10": 0.6080305695533752,
"rewards/batch_coverage_15": 0.6152806878089905,
"rewards/batch_coverage_20": 0.6200535535812378,
"rewards/batch_coverage_25": 0.6224696636199951,
"rewards/batch_coverage_5": 0.5473322987556457,
"rewards/brier_reward": 0.8666894555091857,
"rewards/confidence_uniqueness_reward": 0.8023838877677918,
"rewards/format_reward": 0.9940104126930237,
"rewards/frontier_aurc_reward": -0.0010168740176595748,
"rewards/frontier_ece_reward": 0.01128138080239296,
"rewards/frontier_entropy_batch_reward": -0.5788492798805237,
"signal/accuracy_reward/centered_abs_mean": 0.15622830092906953,
"signal/accuracy_reward/group_std_mean": 0.20322224497795105,
"signal/accuracy_reward/group_zero_std_frac": 0.43055556416511537,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07811415046453477,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07811415046453477,
"signal/advantage_abs_mean": 0.12816403806209564,
"signal/advantage_pre_scale_abs_mean": 0.12816403806209564,
"signal/advantage_pre_scale_std": 0.21938224732875825,
"signal/advantage_std": 0.21938224732875825,
"signal/batch_coverage_0/centered_abs_mean": 0.12863150537014006,
"signal/batch_coverage_0/group_std_mean": 0.17149013876914979,
"signal/batch_coverage_0/group_zero_std_frac": 0.02222222238779068,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012863150611519814,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012863150611519814,
"signal/batch_coverage_1/centered_abs_mean": 0.12863150537014006,
"signal/batch_coverage_1/group_std_mean": 0.17149013876914979,
"signal/batch_coverage_1/group_zero_std_frac": 0.02222222238779068,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012863150611519814,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012863150611519814,
"signal/batch_coverage_10/centered_abs_mean": 0.15439578890800476,
"signal/batch_coverage_10/group_std_mean": 0.21234571635723115,
"signal/batch_coverage_10/group_zero_std_frac": 0.02222222238779068,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.015439579635858536,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.015439579635858536,
"signal/batch_coverage_15/centered_abs_mean": 0.15866729915142058,
"signal/batch_coverage_15/group_std_mean": 0.21745036244392396,
"signal/batch_coverage_15/group_zero_std_frac": 0.02222222238779068,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01586672980338335,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01586672980338335,
"signal/batch_coverage_20/centered_abs_mean": 0.16100625395774842,
"signal/batch_coverage_20/group_std_mean": 0.2214736223220825,
"signal/batch_coverage_20/group_zero_std_frac": 0.02222222238779068,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.016100625693798064,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.016100625693798064,
"signal/batch_coverage_25/centered_abs_mean": 0.16343387961387634,
"signal/batch_coverage_25/group_std_mean": 0.22452974319458008,
"signal/batch_coverage_25/group_zero_std_frac": 0.02222222238779068,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.016343388892710208,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.016343388892710208,
"signal/batch_coverage_5/centered_abs_mean": 0.13978493511676787,
"signal/batch_coverage_5/group_std_mean": 0.18872489631175995,
"signal/batch_coverage_5/group_zero_std_frac": 0.02222222238779068,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.013978493958711624,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.013978493958711624,
"signal/brier_reward/centered_abs_mean": 0.11147035211324692,
"signal/brier_reward/group_std_mean": 0.15009200870990752,
"signal/brier_reward/group_zero_std_frac": 0.02222222238779068,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011147035658359528,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011147035658359528,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08754237294197083,
"signal/confidence_uniqueness_reward/group_std_mean": 0.11724895834922791,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.02222222238779068,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.00875423727557063,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.00875423727557063,
"signal/format_reward/centered_abs_mean": 0.01034613698720932,
"signal/format_reward/group_std_mean": 0.021106501668691637,
"signal/format_reward/group_zero_std_frac": 0.9055555701255799,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00517306849360466,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00517306849360466,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0011056010029278696,
"signal/frontier_aurc_reward/group_std_mean": 0.0016772629460319877,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.3820012736687203e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.3820012736687203e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.009840690344572068,
"signal/frontier_ece_reward/group_std_mean": 0.01416495107114315,
"signal/frontier_ece_reward/group_zero_std_frac": 0.04722222331911326,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0009840689948759974,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0009840689948759974,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.27197187542915346,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3398334622383118,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.1861111119389534,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.027197187766432762,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.027197187766432762,
"step": 155
},
{
"calibration/aurc": 0.10045860708990102,
"calibration/batch_distribution_entropy": 0.5876831620875574,
"calibration/buffer_distribution_entropy": 0.6983014411007992,
"calibration/confidence_entropy": 0.20393004988738403,
"calibration/coverage@0%": 0.13159268929503914,
"calibration/coverage@1%": 0.1404699738903394,
"calibration/coverage@10%": 0.6726799185487276,
"calibration/coverage@15%": 0.7299267652580486,
"calibration/coverage@20%": 0.8663485397658626,
"calibration/coverage@25%": 0.8970835978835978,
"calibration/coverage@30%": 0.9098666666666666,
"calibration/coverage@5%": 0.5083079297763791,
"calibration/ece": 0.09946244024236016,
"calibration/mean_confidence": 0.7541408872262508,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007725694444444442,
"completions/max_length": 3137.6,
"completions/max_terminated_length": 3137.6,
"completions/mean_length": 969.1362915039062,
"completions/mean_terminated_length": 976.686181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 310.8,
"epoch": 0.38399520005999926,
"grad_norm": 0.0004891646676696837,
"learning_rate": 1.4457831325301204e-06,
"loss": -0.0087,
"num_tokens": 387956066.0,
"reward": 1.3025524616241455,
"reward_std": 0.18135660588741304,
"rewards/accuracy_reward": 0.6518229126930237,
"rewards/batch_coverage_0": 0.4337134540081024,
"rewards/batch_coverage_1": 0.4337134540081024,
"rewards/batch_coverage_10": 0.5615343689918518,
"rewards/batch_coverage_15": 0.5716203927993775,
"rewards/batch_coverage_20": 0.5894143342971802,
"rewards/batch_coverage_25": 0.5949154138565064,
"rewards/batch_coverage_5": 0.5151754200458527,
"rewards/brier_reward": 0.8427359223365783,
"rewards/confidence_uniqueness_reward": 0.8198214292526245,
"rewards/format_reward": 0.9922742962837219,
"rewards/frontier_aurc_reward": -0.0014497652417048811,
"rewards/frontier_ece_reward": 0.009934188798069954,
"rewards/frontier_entropy_batch_reward": -0.5673593401908874,
"signal/accuracy_reward/centered_abs_mean": 0.14571940302848815,
"signal/accuracy_reward/group_std_mean": 0.1967881828546524,
"signal/accuracy_reward/group_zero_std_frac": 0.42222222685813904,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07285970151424408,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07285970151424408,
"signal/advantage_abs_mean": 0.13016137778759002,
"signal/advantage_pre_scale_abs_mean": 0.13016137778759002,
"signal/advantage_pre_scale_std": 0.22107622325420379,
"signal/advantage_std": 0.22107622325420379,
"signal/batch_coverage_0/centered_abs_mean": 0.12322651296854019,
"signal/batch_coverage_0/group_std_mean": 0.16826023459434508,
"signal/batch_coverage_0/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012322651594877243,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012322651594877243,
"signal/batch_coverage_1/centered_abs_mean": 0.12322651296854019,
"signal/batch_coverage_1/group_std_mean": 0.16826023459434508,
"signal/batch_coverage_1/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012322651594877243,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012322651594877243,
"signal/batch_coverage_10/centered_abs_mean": 0.14654467403888702,
"signal/batch_coverage_10/group_std_mean": 0.20563188791275025,
"signal/batch_coverage_10/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01465446725487709,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01465446725487709,
"signal/batch_coverage_15/centered_abs_mean": 0.15097981691360474,
"signal/batch_coverage_15/group_std_mean": 0.2103422313928604,
"signal/batch_coverage_15/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.015097982622683049,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.015097982622683049,
"signal/batch_coverage_20/centered_abs_mean": 0.16249073147773743,
"signal/batch_coverage_20/group_std_mean": 0.22473800778388978,
"signal/batch_coverage_20/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.01624907273799181,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.01624907273799181,
"signal/batch_coverage_25/centered_abs_mean": 0.16827959716320037,
"signal/batch_coverage_25/group_std_mean": 0.23056098818778992,
"signal/batch_coverage_25/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.016827961057424547,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.016827961057424547,
"signal/batch_coverage_5/centered_abs_mean": 0.1344536691904068,
"signal/batch_coverage_5/group_std_mean": 0.18762777447700502,
"signal/batch_coverage_5/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.013445367850363255,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.013445367850363255,
"signal/brier_reward/centered_abs_mean": 0.11661726981401443,
"signal/brier_reward/group_std_mean": 0.15799384713172912,
"signal/brier_reward/group_zero_std_frac": 0.008333333395421505,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011661727912724018,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011661727912724018,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08469038307666779,
"signal/confidence_uniqueness_reward/group_std_mean": 0.1138753980398178,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.008333333395421505,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008469038363546132,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008469038363546132,
"signal/format_reward/centered_abs_mean": 0.014100477658212185,
"signal/format_reward/group_std_mean": 0.02932555302977562,
"signal/format_reward/group_zero_std_frac": 0.8722222328186036,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.007050238829106092,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.007050238829106092,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0013116570771671832,
"signal/frontier_aurc_reward/group_std_mean": 0.0020308221224695443,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.639571491978131e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.639571491978131e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.00958633776754141,
"signal/frontier_ece_reward/group_std_mean": 0.013803408481180668,
"signal/frontier_ece_reward/group_zero_std_frac": 0.022222222574055196,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0009586337953805923,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0009586337953805923,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.285816890001297,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.36413044333457945,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.10555555820465087,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.028581687808036806,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.028581687808036806,
"step": 160
},
{
"calibration/aurc": 0.10952826831792246,
"calibration/batch_distribution_entropy": 0.6720452611414822,
"calibration/buffer_distribution_entropy": 0.6942523445932581,
"calibration/confidence_entropy": 0.23859629732682824,
"calibration/coverage@0%": 0.0203125,
"calibration/coverage@1%": 0.11354166666666668,
"calibration/coverage@10%": 0.6200841238051255,
"calibration/coverage@15%": 0.7198256977121924,
"calibration/coverage@20%": 0.7748366290112996,
"calibration/coverage@25%": 0.8324017637657549,
"calibration/coverage@30%": 0.8856770056801239,
"calibration/coverage@5%": 0.5416919134296264,
"calibration/ece": 0.08986915787646825,
"calibration/mean_confidence": 0.6594262186067543,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0066840277777777905,
"completions/max_length": 3693.0,
"completions/max_terminated_length": 3693.0,
"completions/mean_length": 986.36953125,
"completions/mean_terminated_length": 993.1266357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.8,
"epoch": 0.39599505006187424,
"grad_norm": 0.0005287404055707157,
"learning_rate": 1.2951807228915664e-06,
"loss": -0.0053,
"num_tokens": 402458115.0,
"reward": 1.3131772518157958,
"reward_std": 0.17054618895053864,
"rewards/accuracy_reward": 0.6387152671813965,
"rewards/batch_coverage_0": 0.4660345435142517,
"rewards/batch_coverage_1": 0.4660345435142517,
"rewards/batch_coverage_10": 0.5774924635887146,
"rewards/batch_coverage_15": 0.5880944848060607,
"rewards/batch_coverage_20": 0.5928570389747619,
"rewards/batch_coverage_25": 0.6022311568260192,
"rewards/batch_coverage_5": 0.5361921072006226,
"rewards/brier_reward": 0.8557339429855346,
"rewards/confidence_uniqueness_reward": 0.8580495834350585,
"rewards/format_reward": 0.9933159828186036,
"rewards/frontier_aurc_reward": -0.0011106105986982584,
"rewards/frontier_ece_reward": 0.009472636692225932,
"rewards/frontier_entropy_batch_reward": -0.5804377794265747,
"signal/accuracy_reward/centered_abs_mean": 0.13853081464767455,
"signal/accuracy_reward/group_std_mean": 0.1859588861465454,
"signal/accuracy_reward/group_zero_std_frac": 0.4583333432674408,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06926540732383728,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06926540732383728,
"signal/advantage_abs_mean": 0.12401239275932312,
"signal/advantage_pre_scale_abs_mean": 0.12401239275932312,
"signal/advantage_pre_scale_std": 0.20768028497695923,
"signal/advantage_std": 0.20768028497695923,
"signal/batch_coverage_0/centered_abs_mean": 0.12554647028446198,
"signal/batch_coverage_0/group_std_mean": 0.17078601717948913,
"signal/batch_coverage_0/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012554646842181683,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012554646842181683,
"signal/batch_coverage_1/centered_abs_mean": 0.12554647028446198,
"signal/batch_coverage_1/group_std_mean": 0.17078601717948913,
"signal/batch_coverage_1/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012554646842181683,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012554646842181683,
"signal/batch_coverage_10/centered_abs_mean": 0.1449828863143921,
"signal/batch_coverage_10/group_std_mean": 0.2013940066099167,
"signal/batch_coverage_10/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.014498288556933404,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.014498288556933404,
"signal/batch_coverage_15/centered_abs_mean": 0.14958977848291397,
"signal/batch_coverage_15/group_std_mean": 0.20786149501800538,
"signal/batch_coverage_15/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01495897825807333,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01495897825807333,
"signal/batch_coverage_20/centered_abs_mean": 0.14767719060182571,
"signal/batch_coverage_20/group_std_mean": 0.20538305938243867,
"signal/batch_coverage_20/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.014767719991505146,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.014767719991505146,
"signal/batch_coverage_25/centered_abs_mean": 0.15909694731235505,
"signal/batch_coverage_25/group_std_mean": 0.218677294254303,
"signal/batch_coverage_25/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015909695252776145,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015909695252776145,
"signal/batch_coverage_5/centered_abs_mean": 0.1341264843940735,
"signal/batch_coverage_5/group_std_mean": 0.18379494845867156,
"signal/batch_coverage_5/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.013412648998200893,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.013412648998200893,
"signal/brier_reward/centered_abs_mean": 0.11033482253551483,
"signal/brier_reward/group_std_mean": 0.14876342713832855,
"signal/brier_reward/group_zero_std_frac": 0.002777777798473835,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011033482663333417,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011033482663333417,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.0680958390235901,
"signal/confidence_uniqueness_reward/group_std_mean": 0.08948377221822738,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.00555555559694767,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.006809584051370621,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006809584051370621,
"signal/format_reward/centered_abs_mean": 0.00956488698720932,
"signal/format_reward/group_std_mean": 0.019422347657382488,
"signal/format_reward/group_zero_std_frac": 0.9138888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00478244349360466,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00478244349360466,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0009592408197931946,
"signal/frontier_aurc_reward/group_std_mean": 0.0014806427410803736,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.1990510483883553e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.1990510483883553e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.009095791541039944,
"signal/frontier_ece_reward/group_std_mean": 0.012939628027379513,
"signal/frontier_ece_reward/group_zero_std_frac": 0.019444444589316844,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0009095791378058493,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0009095791378058493,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.2910008132457733,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3624585449695587,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.1416666701436043,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.02910008132457733,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.02910008132457733,
"step": 165
},
{
"calibration/aurc": 0.1035497171259897,
"calibration/batch_distribution_entropy": 0.6194467482238635,
"calibration/buffer_distribution_entropy": 0.6964728802094691,
"calibration/confidence_entropy": 0.2095385484232973,
"calibration/coverage@0%": 0.12944753490401398,
"calibration/coverage@1%": 0.14463078097731238,
"calibration/coverage@10%": 0.6789776081184496,
"calibration/coverage@15%": 0.7661971044367294,
"calibration/coverage@20%": 0.843384525692714,
"calibration/coverage@25%": 0.8931907133391196,
"calibration/coverage@30%": 0.9410449389179757,
"calibration/coverage@5%": 0.3630785060335414,
"calibration/ece": 0.1010162488416066,
"calibration/mean_confidence": 0.7447559374268262,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006770833333333348,
"completions/max_length": 3683.4,
"completions/max_terminated_length": 3683.4,
"completions/mean_length": 968.5309936523438,
"completions/mean_terminated_length": 975.1132934570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 331.2,
"epoch": 0.4079949000637492,
"grad_norm": 0.0006581774796359241,
"learning_rate": 1.1445783132530121e-06,
"loss": -0.0073,
"num_tokens": 416704776.0,
"reward": 1.3400552988052368,
"reward_std": 0.17083775103092194,
"rewards/accuracy_reward": 0.6823784708976746,
"rewards/batch_coverage_0": 0.4667522668838501,
"rewards/batch_coverage_1": 0.4667522668838501,
"rewards/batch_coverage_10": 0.5824069619178772,
"rewards/batch_coverage_15": 0.5999761819839478,
"rewards/batch_coverage_20": 0.6159539341926574,
"rewards/batch_coverage_25": 0.6178914666175842,
"rewards/batch_coverage_5": 0.538118553161621,
"rewards/brier_reward": 0.873453962802887,
"rewards/confidence_uniqueness_reward": 0.8486914992332458,
"rewards/format_reward": 0.9931423664093018,
"rewards/frontier_aurc_reward": -0.0007993413018994033,
"rewards/frontier_ece_reward": 0.00918477177619934,
"rewards/frontier_entropy_batch_reward": -0.5961334109306335,
"signal/accuracy_reward/centered_abs_mean": 0.1437554255127907,
"signal/accuracy_reward/group_std_mean": 0.1925100266933441,
"signal/accuracy_reward/group_zero_std_frac": 0.4388888955116272,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07187771275639535,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07187771275639535,
"signal/advantage_abs_mean": 0.1232904389500618,
"signal/advantage_pre_scale_abs_mean": 0.1232904389500618,
"signal/advantage_pre_scale_std": 0.2124456822872162,
"signal/advantage_std": 0.2124456822872162,
"signal/batch_coverage_0/centered_abs_mean": 0.11614251732826233,
"signal/batch_coverage_0/group_std_mean": 0.1566988781094551,
"signal/batch_coverage_0/group_zero_std_frac": 0.011111111380159855,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011614251881837845,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.011614251881837845,
"signal/batch_coverage_1/centered_abs_mean": 0.11614251732826233,
"signal/batch_coverage_1/group_std_mean": 0.1566988781094551,
"signal/batch_coverage_1/group_zero_std_frac": 0.011111111380159855,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011614251881837845,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.011614251881837845,
"signal/batch_coverage_10/centered_abs_mean": 0.13287641555070878,
"signal/batch_coverage_10/group_std_mean": 0.1858629435300827,
"signal/batch_coverage_10/group_zero_std_frac": 0.011111111380159855,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.013287641853094102,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.013287641853094102,
"signal/batch_coverage_15/centered_abs_mean": 0.1418210372328758,
"signal/batch_coverage_15/group_std_mean": 0.19721043407917022,
"signal/batch_coverage_15/group_zero_std_frac": 0.011111111380159855,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.014182103984057903,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.014182103984057903,
"signal/batch_coverage_20/centered_abs_mean": 0.15203088521957397,
"signal/batch_coverage_20/group_std_mean": 0.2103426307439804,
"signal/batch_coverage_20/group_zero_std_frac": 0.011111111380159855,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.015203088335692883,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.015203088335692883,
"signal/batch_coverage_25/centered_abs_mean": 0.15397260189056397,
"signal/batch_coverage_25/group_std_mean": 0.21220978796482087,
"signal/batch_coverage_25/group_zero_std_frac": 0.011111111380159855,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015397260524332523,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015397260524332523,
"signal/batch_coverage_5/centered_abs_mean": 0.12382150441408157,
"signal/batch_coverage_5/group_std_mean": 0.16996322572231293,
"signal/batch_coverage_5/group_zero_std_frac": 0.011111111380159855,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012382151931524277,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012382151931524277,
"signal/brier_reward/centered_abs_mean": 0.10963434725999832,
"signal/brier_reward/group_std_mean": 0.14850548803806304,
"signal/brier_reward/group_zero_std_frac": 0.011111111380159855,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.010963435098528863,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.010963435098528863,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.06850620657205582,
"signal/confidence_uniqueness_reward/group_std_mean": 0.08941082209348679,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.011111111380159855,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.006850621197372675,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006850621197372675,
"signal/format_reward/centered_abs_mean": 0.011572265438735485,
"signal/format_reward/group_std_mean": 0.022224038653075696,
"signal/format_reward/group_zero_std_frac": 0.9055555701255799,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.005786132719367743,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.005786132719367743,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0007456650491803885,
"signal/frontier_aurc_reward/group_std_mean": 0.001207607495598495,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 9.320813296653796e-06,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 9.320813296653796e-06,
"signal/frontier_ece_reward/centered_abs_mean": 0.008235116582363844,
"signal/frontier_ece_reward/group_std_mean": 0.011895155161619186,
"signal/frontier_ece_reward/group_zero_std_frac": 0.03888889010995626,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0008235116838477552,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0008235116838477552,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.280417799949646,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3480204403400421,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.18055555820465088,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.02804178111255169,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.02804178111255169,
"step": 170
},
{
"calibration/aurc": 0.08584934334504304,
"calibration/batch_distribution_entropy": 0.7040840441985023,
"calibration/buffer_distribution_entropy": 0.6904400099981481,
"calibration/confidence_entropy": 0.24128080965627605,
"calibration/coverage@0%": 0.20016072193439322,
"calibration/coverage@1%": 0.24455671389644537,
"calibration/coverage@10%": 0.6587761989219518,
"calibration/coverage@15%": 0.7744429150954781,
"calibration/coverage@20%": 0.8411959239143305,
"calibration/coverage@25%": 0.9012597940524044,
"calibration/coverage@30%": 0.9544108931847044,
"calibration/coverage@5%": 0.5571439707095154,
"calibration/ece": 0.08052070579497293,
"calibration/mean_confidence": 0.6720276185336083,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00616319444444442,
"completions/max_length": 3571.8,
"completions/max_terminated_length": 3571.8,
"completions/mean_length": 1022.02509765625,
"completions/mean_terminated_length": 1028.35546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 332.2,
"epoch": 0.4199947500656242,
"grad_norm": 0.000544732843991369,
"learning_rate": 9.93975903614458e-07,
"loss": -0.0048,
"num_tokens": 431586473.0,
"reward": 1.342452049255371,
"reward_std": 0.17677203714847564,
"rewards/accuracy_reward": 0.6801215291023255,
"rewards/batch_coverage_0": 0.4610255122184753,
"rewards/batch_coverage_1": 0.4610255122184753,
"rewards/batch_coverage_10": 0.5977589607238769,
"rewards/batch_coverage_15": 0.609941303730011,
"rewards/batch_coverage_20": 0.6217445015907288,
"rewards/batch_coverage_25": 0.6220832824707031,
"rewards/batch_coverage_5": 0.538027310371399,
"rewards/brier_reward": 0.8769816517829895,
"rewards/confidence_uniqueness_reward": 0.8499362587928772,
"rewards/format_reward": 0.9938368082046509,
"rewards/frontier_aurc_reward": -0.0008405924076214432,
"rewards/frontier_ece_reward": 0.008787432871758937,
"rewards/frontier_entropy_batch_reward": -0.5924777626991272,
"signal/accuracy_reward/centered_abs_mean": 0.13968641459941863,
"signal/accuracy_reward/group_std_mean": 0.19224803447723388,
"signal/accuracy_reward/group_zero_std_frac": 0.42777777910232545,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06984320729970932,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.06984320729970932,
"signal/advantage_abs_mean": 0.12613593339920043,
"signal/advantage_pre_scale_abs_mean": 0.12613593339920043,
"signal/advantage_pre_scale_std": 0.2150324821472168,
"signal/advantage_std": 0.2150324821472168,
"signal/batch_coverage_0/centered_abs_mean": 0.11822779774665833,
"signal/batch_coverage_0/group_std_mean": 0.16137382984161378,
"signal/batch_coverage_0/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011822779849171638,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.011822779849171638,
"signal/batch_coverage_1/centered_abs_mean": 0.11822779774665833,
"signal/batch_coverage_1/group_std_mean": 0.16137382984161378,
"signal/batch_coverage_1/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011822779849171638,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.011822779849171638,
"signal/batch_coverage_10/centered_abs_mean": 0.14888089001178742,
"signal/batch_coverage_10/group_std_mean": 0.20695665776729583,
"signal/batch_coverage_10/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01488808896392584,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01488808896392584,
"signal/batch_coverage_15/centered_abs_mean": 0.15403354167938232,
"signal/batch_coverage_15/group_std_mean": 0.21282553374767305,
"signal/batch_coverage_15/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.015403354167938232,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.015403354167938232,
"signal/batch_coverage_20/centered_abs_mean": 0.16212478578090667,
"signal/batch_coverage_20/group_std_mean": 0.22311893701553345,
"signal/batch_coverage_20/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.016212479583919047,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.016212479583919047,
"signal/batch_coverage_25/centered_abs_mean": 0.1614815503358841,
"signal/batch_coverage_25/group_std_mean": 0.22323354780673982,
"signal/batch_coverage_25/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.01614815555512905,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.01614815555512905,
"signal/batch_coverage_5/centered_abs_mean": 0.12908718585968018,
"signal/batch_coverage_5/group_std_mean": 0.1776062160730362,
"signal/batch_coverage_5/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012908719293773175,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012908719293773175,
"signal/brier_reward/centered_abs_mean": 0.10861355811357498,
"signal/brier_reward/group_std_mean": 0.15155054926872252,
"signal/brier_reward/group_zero_std_frac": 0.002777777798473835,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.010861356370151043,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.010861356370151043,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.0684033289551735,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09072864055633545,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.002777777798473835,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.006840333249419928,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006840333249419928,
"signal/format_reward/centered_abs_mean": 0.01100802943110466,
"signal/format_reward/group_std_mean": 0.021973739936947824,
"signal/format_reward/group_zero_std_frac": 0.9055555701255799,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.00550401471555233,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.00550401471555233,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0007584088016301394,
"signal/frontier_aurc_reward/group_std_mean": 0.001169906440190971,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 9.480110020376741e-06,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 9.480110020376741e-06,
"signal/frontier_ece_reward/centered_abs_mean": 0.008332806825637817,
"signal/frontier_ece_reward/group_std_mean": 0.012288989685475826,
"signal/frontier_ece_reward/group_zero_std_frac": 0.02222222238779068,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0008332806755788624,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0008332806755788624,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.2823038697242737,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.35158644914627074,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.16111111342906953,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.028230386972427367,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.028230386972427367,
"step": 175
},
{
"calibration/aurc": 0.0935197326499746,
"calibration/batch_distribution_entropy": 0.5561873594983229,
"calibration/buffer_distribution_entropy": 0.6859575233089831,
"calibration/confidence_entropy": 0.19136952886651085,
"calibration/coverage@0%": 0.0421875,
"calibration/coverage@1%": 0.0421875,
"calibration/coverage@10%": 0.6712114141386544,
"calibration/coverage@15%": 0.800580566611605,
"calibration/coverage@20%": 0.8736033708606643,
"calibration/coverage@25%": 0.9231577989486137,
"calibration/coverage@30%": 0.9632084782897822,
"calibration/coverage@5%": 0.38217653629633197,
"calibration/ece": 0.08765554258794374,
"calibration/mean_confidence": 0.7494340338297194,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008420138888888862,
"completions/max_length": 3728.2,
"completions/max_terminated_length": 3728.2,
"completions/mean_length": 996.6256103515625,
"completions/mean_terminated_length": 1005.0981567382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 355.6,
"epoch": 0.4319946000674992,
"grad_norm": 0.0005954225780442357,
"learning_rate": 8.433734939759036e-07,
"loss": -0.0082,
"num_tokens": 446167568.0,
"reward": 1.3270483493804932,
"reward_std": 0.177334925532341,
"rewards/accuracy_reward": 0.6724826335906983,
"rewards/batch_coverage_0": 0.47177304029464723,
"rewards/batch_coverage_1": 0.47177304029464723,
"rewards/batch_coverage_10": 0.5869078040122986,
"rewards/batch_coverage_15": 0.5977437496185303,
"rewards/batch_coverage_20": 0.6142584323883057,
"rewards/batch_coverage_25": 0.620146906375885,
"rewards/batch_coverage_5": 0.5393850266933441,
"rewards/brier_reward": 0.8569316625595093,
"rewards/confidence_uniqueness_reward": 0.806365144252777,
"rewards/format_reward": 0.9914930582046508,
"rewards/frontier_aurc_reward": -0.001556425471790135,
"rewards/frontier_ece_reward": 0.008400358818471431,
"rewards/frontier_entropy_batch_reward": -0.622886312007904,
"signal/accuracy_reward/centered_abs_mean": 0.15272895097732545,
"signal/accuracy_reward/group_std_mean": 0.1959119915962219,
"signal/accuracy_reward/group_zero_std_frac": 0.46111111640930175,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07636447548866272,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07636447548866272,
"signal/advantage_abs_mean": 0.13030389845371246,
"signal/advantage_pre_scale_abs_mean": 0.13030389845371246,
"signal/advantage_pre_scale_std": 0.22473380863666534,
"signal/advantage_std": 0.22473380863666534,
"signal/batch_coverage_0/centered_abs_mean": 0.11375184804201126,
"signal/batch_coverage_0/group_std_mean": 0.1565089762210846,
"signal/batch_coverage_0/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.01137518547475338,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01137518547475338,
"signal/batch_coverage_1/centered_abs_mean": 0.11375184804201126,
"signal/batch_coverage_1/group_std_mean": 0.1565089762210846,
"signal/batch_coverage_1/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.01137518547475338,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.01137518547475338,
"signal/batch_coverage_10/centered_abs_mean": 0.13260578811168672,
"signal/batch_coverage_10/group_std_mean": 0.18635527193546295,
"signal/batch_coverage_10/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.013260579481720925,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.013260579481720925,
"signal/batch_coverage_15/centered_abs_mean": 0.13684237003326416,
"signal/batch_coverage_15/group_std_mean": 0.19186861515045167,
"signal/batch_coverage_15/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.013684236630797385,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.013684236630797385,
"signal/batch_coverage_20/centered_abs_mean": 0.14961515069007875,
"signal/batch_coverage_20/group_std_mean": 0.2078137993812561,
"signal/batch_coverage_20/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.014961515367031098,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.014961515367031098,
"signal/batch_coverage_25/centered_abs_mean": 0.15616746842861176,
"signal/batch_coverage_25/group_std_mean": 0.21553831696510314,
"signal/batch_coverage_25/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015616746619343758,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015616746619343758,
"signal/batch_coverage_5/centered_abs_mean": 0.12186728864908218,
"signal/batch_coverage_5/group_std_mean": 0.169427090883255,
"signal/batch_coverage_5/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012186729349195957,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012186729349195957,
"signal/brier_reward/centered_abs_mean": 0.11513119339942932,
"signal/brier_reward/group_std_mean": 0.15451994240283967,
"signal/brier_reward/group_zero_std_frac": 0.008333333395421505,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011513119749724864,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011513119749724864,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.082037752866745,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10694921165704727,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.008333333395421505,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008203774876892566,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008203774876892566,
"signal/format_reward/centered_abs_mean": 0.013975694589316846,
"signal/format_reward/group_std_mean": 0.02479696534574032,
"signal/format_reward/group_zero_std_frac": 0.9000000119209289,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.006987847294658423,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006987847294658423,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0012580604292452336,
"signal/frontier_aurc_reward/group_std_mean": 0.0019183105323463678,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.5725756384199485e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.5725756384199485e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.007519526779651642,
"signal/frontier_ece_reward/group_std_mean": 0.01085415929555893,
"signal/frontier_ece_reward/group_zero_std_frac": 0.027777778171002864,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.00075195268727839,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.00075195268727839,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.25678522884845734,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3297272324562073,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.18055555522441863,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.025678522139787673,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.025678522139787673,
"step": 180
},
{
"calibration/aurc": 0.14623769358867256,
"calibration/batch_distribution_entropy": 0.6229734962511724,
"calibration/buffer_distribution_entropy": 0.6802651233028907,
"calibration/confidence_entropy": 0.23214373257914972,
"calibration/coverage@0%": 0.03498014577893821,
"calibration/coverage@1%": 0.10129868363794606,
"calibration/coverage@10%": 0.29276952785030463,
"calibration/coverage@15%": 0.7551255491317501,
"calibration/coverage@20%": 0.855133763840992,
"calibration/coverage@25%": 0.9205799220272904,
"calibration/coverage@30%": 0.9515625,
"calibration/coverage@5%": 0.10129868363794606,
"calibration/ece": 0.141819133742561,
"calibration/mean_confidence": 0.7414098737269303,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007899305555555559,
"completions/max_length": 3349.6,
"completions/max_terminated_length": 3349.6,
"completions/mean_length": 989.527099609375,
"completions/mean_terminated_length": 997.413623046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 297.6,
"epoch": 0.44399445006937416,
"grad_norm": 0.0005679742316715419,
"learning_rate": 6.927710843373495e-07,
"loss": -0.008,
"num_tokens": 460656968.0,
"reward": 1.3158263444900513,
"reward_std": 0.17870357632637024,
"rewards/accuracy_reward": 0.650868046283722,
"rewards/batch_coverage_0": 0.4548503696918488,
"rewards/batch_coverage_1": 0.4548503696918488,
"rewards/batch_coverage_10": 0.5850084662437439,
"rewards/batch_coverage_15": 0.5976876854896546,
"rewards/batch_coverage_20": 0.6115280747413635,
"rewards/batch_coverage_25": 0.6098129034042359,
"rewards/batch_coverage_5": 0.5300012052059173,
"rewards/brier_reward": 0.8578875422477722,
"rewards/confidence_uniqueness_reward": 0.832545804977417,
"rewards/format_reward": 0.9921006917953491,
"rewards/frontier_aurc_reward": -0.001072891615331173,
"rewards/frontier_ece_reward": 0.006930419430136681,
"rewards/frontier_entropy_batch_reward": -0.5975492119789123,
"signal/accuracy_reward/centered_abs_mean": 0.14483506977558136,
"signal/accuracy_reward/group_std_mean": 0.1936337411403656,
"signal/accuracy_reward/group_zero_std_frac": 0.4444444477558136,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07241753488779068,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07241753488779068,
"signal/advantage_abs_mean": 0.12828055024147034,
"signal/advantage_pre_scale_abs_mean": 0.12828055024147034,
"signal/advantage_pre_scale_std": 0.2207647293806076,
"signal/advantage_std": 0.2207647293806076,
"signal/batch_coverage_0/centered_abs_mean": 0.11986812949180603,
"signal/batch_coverage_0/group_std_mean": 0.1622163325548172,
"signal/batch_coverage_0/group_zero_std_frac": 0.016666666977107523,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011986814253032208,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.011986814253032208,
"signal/batch_coverage_1/centered_abs_mean": 0.11986812949180603,
"signal/batch_coverage_1/group_std_mean": 0.1622163325548172,
"signal/batch_coverage_1/group_zero_std_frac": 0.016666666977107523,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011986814253032208,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.011986814253032208,
"signal/batch_coverage_10/centered_abs_mean": 0.1427648186683655,
"signal/batch_coverage_10/group_std_mean": 0.19960642158985137,
"signal/batch_coverage_10/group_zero_std_frac": 0.016666666977107523,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.014276482164859772,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.014276482164859772,
"signal/batch_coverage_15/centered_abs_mean": 0.1482395738363266,
"signal/batch_coverage_15/group_std_mean": 0.20609240233898163,
"signal/batch_coverage_15/group_zero_std_frac": 0.016666666977107523,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.014823957718908787,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.014823957718908787,
"signal/batch_coverage_20/centered_abs_mean": 0.15430747270584105,
"signal/batch_coverage_20/group_std_mean": 0.21396797895431519,
"signal/batch_coverage_20/group_zero_std_frac": 0.016666666977107523,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.015430747903883458,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.015430747903883458,
"signal/batch_coverage_25/centered_abs_mean": 0.1554022252559662,
"signal/batch_coverage_25/group_std_mean": 0.21482096016407012,
"signal/batch_coverage_25/group_zero_std_frac": 0.016666666977107523,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015540223196148872,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015540223196148872,
"signal/batch_coverage_5/centered_abs_mean": 0.1292220577597618,
"signal/batch_coverage_5/group_std_mean": 0.17766384184360504,
"signal/batch_coverage_5/group_zero_std_frac": 0.016666666977107523,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012922205589711666,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012922205589711666,
"signal/brier_reward/centered_abs_mean": 0.11412379443645478,
"signal/brier_reward/group_std_mean": 0.15445701479911805,
"signal/brier_reward/group_zero_std_frac": 0.016666666977107523,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011412379890680313,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011412379890680313,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07410852909088135,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09989662021398545,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.016666666977107523,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0074108530767261985,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0074108530767261985,
"signal/format_reward/centered_abs_mean": 0.01372070349752903,
"signal/format_reward/group_std_mean": 0.027843139693140985,
"signal/format_reward/group_zero_std_frac": 0.875,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.006860351748764515,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006860351748764515,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0009126473218202591,
"signal/frontier_aurc_reward/group_std_mean": 0.001439414289779961,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.1408091813791543e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.1408091813791543e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.00754654137417674,
"signal/frontier_ece_reward/group_std_mean": 0.010986723564565181,
"signal/frontier_ece_reward/group_zero_std_frac": 0.02777777798473835,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0007546541397459805,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0007546541397459805,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.27296812534332277,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.33934701681137086,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.18611110895872116,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.027296814322471618,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.027296814322471618,
"step": 185
},
{
"calibration/aurc": 0.13128068853617544,
"calibration/batch_distribution_entropy": 0.4795554241420736,
"calibration/buffer_distribution_entropy": 0.6778663229174313,
"calibration/confidence_entropy": 0.17752499581463171,
"calibration/coverage@0%": 0.03769633507853403,
"calibration/coverage@1%": 0.03769633507853403,
"calibration/coverage@10%": 0.47621794820527397,
"calibration/coverage@15%": 0.5475001368753905,
"calibration/coverage@20%": 0.8411420084895374,
"calibration/coverage@25%": 0.9311601630611935,
"calibration/coverage@30%": 0.9657873888272697,
"calibration/coverage@5%": 0.2875282446955084,
"calibration/ece": 0.10620229144234192,
"calibration/mean_confidence": 0.7968994675856337,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006336805555555558,
"completions/max_length": 3539.6,
"completions/max_terminated_length": 3539.6,
"completions/mean_length": 988.577001953125,
"completions/mean_terminated_length": 994.8688232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 366.4,
"epoch": 0.45599430007124914,
"grad_norm": 0.0006201478536240757,
"learning_rate": 5.421686746987952e-07,
"loss": -0.007,
"num_tokens": 475128319.0,
"reward": 1.3506378889083863,
"reward_std": 0.17299841344356537,
"rewards/accuracy_reward": 0.7050347328186035,
"rewards/batch_coverage_0": 0.4621035397052765,
"rewards/batch_coverage_1": 0.4621035397052765,
"rewards/batch_coverage_10": 0.6018279314041137,
"rewards/batch_coverage_15": 0.6118657708168029,
"rewards/batch_coverage_20": 0.6287239551544189,
"rewards/batch_coverage_25": 0.6334514379501343,
"rewards/batch_coverage_5": 0.552715665102005,
"rewards/brier_reward": 0.8670693159103393,
"rewards/confidence_uniqueness_reward": 0.8103796005249023,
"rewards/format_reward": 0.9936631917953491,
"rewards/frontier_aurc_reward": -0.0011533482233062387,
"rewards/frontier_ece_reward": 0.006376712769269943,
"rewards/frontier_entropy_batch_reward": -0.6235847473144531,
"signal/accuracy_reward/centered_abs_mean": 0.14418402314186096,
"signal/accuracy_reward/group_std_mean": 0.19555995464324952,
"signal/accuracy_reward/group_zero_std_frac": 0.4222222208976746,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07209201157093048,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07209201157093048,
"signal/advantage_abs_mean": 0.12151096612215043,
"signal/advantage_pre_scale_abs_mean": 0.12151096612215043,
"signal/advantage_pre_scale_std": 0.21480103731155395,
"signal/advantage_std": 0.21480103731155395,
"signal/batch_coverage_0/centered_abs_mean": 0.11545856446027755,
"signal/batch_coverage_0/group_std_mean": 0.15694105625152588,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011545856110751629,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.011545856110751629,
"signal/batch_coverage_1/centered_abs_mean": 0.11545856446027755,
"signal/batch_coverage_1/group_std_mean": 0.15694105625152588,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011545856110751629,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.011545856110751629,
"signal/batch_coverage_10/centered_abs_mean": 0.13779610842466355,
"signal/batch_coverage_10/group_std_mean": 0.19468652307987214,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.013779611513018607,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.013779611513018607,
"signal/batch_coverage_15/centered_abs_mean": 0.14144036769866944,
"signal/batch_coverage_15/group_std_mean": 0.19916775822639465,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01414403710514307,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01414403710514307,
"signal/batch_coverage_20/centered_abs_mean": 0.15380342304706573,
"signal/batch_coverage_20/group_std_mean": 0.21572276055812836,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.015380342863500119,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.015380342863500119,
"signal/batch_coverage_25/centered_abs_mean": 0.15857919156551362,
"signal/batch_coverage_25/group_std_mean": 0.22120807468891143,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015857919491827487,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015857919491827487,
"signal/batch_coverage_5/centered_abs_mean": 0.12643881738185883,
"signal/batch_coverage_5/group_std_mean": 0.17589333653450012,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.01264388207346201,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.01264388207346201,
"signal/brier_reward/centered_abs_mean": 0.11211207658052444,
"signal/brier_reward/group_std_mean": 0.15370774865150452,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011211208067834377,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011211208067834377,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08398310542106628,
"signal/confidence_uniqueness_reward/group_std_mean": 0.11177106499671936,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008398310374468565,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008398310374468565,
"signal/format_reward/centered_abs_mean": 0.011474609375,
"signal/format_reward/group_std_mean": 0.024381159618496896,
"signal/format_reward/group_zero_std_frac": 0.8916666746139527,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0057373046875,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0057373046875,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0010549181955866517,
"signal/frontier_aurc_reward/group_std_mean": 0.0016289620660245418,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.3186477372073568e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.3186477372073568e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.006933016143739224,
"signal/frontier_ece_reward/group_std_mean": 0.010268824733793735,
"signal/frontier_ece_reward/group_zero_std_frac": 0.022222222574055196,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.000693301623687148,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.000693301623687148,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.25669166445732117,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3272492527961731,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.2,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.025669166445732118,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.025669166445732118,
"step": 190
},
{
"calibration/aurc": 0.11944810496438059,
"calibration/batch_distribution_entropy": 0.6395999867984298,
"calibration/buffer_distribution_entropy": 0.6720075917237143,
"calibration/confidence_entropy": 0.20917919309424385,
"calibration/coverage@0%": 0.06911732456140351,
"calibration/coverage@1%": 0.14488486842105264,
"calibration/coverage@10%": 0.5310995338477082,
"calibration/coverage@15%": 0.6642291838890421,
"calibration/coverage@20%": 0.8316384506751172,
"calibration/coverage@25%": 0.8892670157068062,
"calibration/coverage@30%": 0.9388170811518325,
"calibration/coverage@5%": 0.37537739965095984,
"calibration/ece": 0.13485540453524325,
"calibration/mean_confidence": 0.6940419641656674,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500000000023,
"completions/max_length": 3358.0,
"completions/max_terminated_length": 3358.0,
"completions/mean_length": 1008.2052124023437,
"completions/mean_terminated_length": 1016.1783325195313,
"completions/min_length": 0.0,
"completions/min_terminated_length": 331.4,
"epoch": 0.46799415007312406,
"grad_norm": 0.0005691568367183208,
"learning_rate": 3.91566265060241e-07,
"loss": -0.0071,
"num_tokens": 489823707.0,
"reward": 1.3036153316497803,
"reward_std": 0.1823348581790924,
"rewards/accuracy_reward": 0.6552083373069764,
"rewards/batch_coverage_0": 0.4402458965778351,
"rewards/batch_coverage_1": 0.4402458965778351,
"rewards/batch_coverage_10": 0.5593804001808167,
"rewards/batch_coverage_15": 0.5775806784629822,
"rewards/batch_coverage_20": 0.6022377133369445,
"rewards/batch_coverage_25": 0.6144470334053039,
"rewards/batch_coverage_5": 0.5079279005527496,
"rewards/brier_reward": 0.8445484519004822,
"rewards/confidence_uniqueness_reward": 0.8110572457313537,
"rewards/format_reward": 0.9921875,
"rewards/frontier_aurc_reward": -0.001369133684784174,
"rewards/frontier_ece_reward": 0.005712912511080503,
"rewards/frontier_entropy_batch_reward": -0.6040391564369202,
"signal/accuracy_reward/centered_abs_mean": 0.14594184011220931,
"signal/accuracy_reward/group_std_mean": 0.19639940857887267,
"signal/accuracy_reward/group_zero_std_frac": 0.4333333373069763,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07297092005610466,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07297092005610466,
"signal/advantage_abs_mean": 0.13279823064804078,
"signal/advantage_pre_scale_abs_mean": 0.13279823064804078,
"signal/advantage_pre_scale_std": 0.22284360826015473,
"signal/advantage_std": 0.22284360826015473,
"signal/batch_coverage_0/centered_abs_mean": 0.12249585688114166,
"signal/batch_coverage_0/group_std_mean": 0.16909070312976837,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.012249586079269648,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.012249586079269648,
"signal/batch_coverage_1/centered_abs_mean": 0.12249585688114166,
"signal/batch_coverage_1/group_std_mean": 0.16909070312976837,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.012249586079269648,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.012249586079269648,
"signal/batch_coverage_10/centered_abs_mean": 0.14008867293596267,
"signal/batch_coverage_10/group_std_mean": 0.19668514728546144,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.014008867554366589,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.014008867554366589,
"signal/batch_coverage_15/centered_abs_mean": 0.14652444720268248,
"signal/batch_coverage_15/group_std_mean": 0.20595037341117858,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.01465244460850954,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.01465244460850954,
"signal/batch_coverage_20/centered_abs_mean": 0.16244602501392363,
"signal/batch_coverage_20/group_std_mean": 0.22658770382404328,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.016244602762162684,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.016244602762162684,
"signal/batch_coverage_25/centered_abs_mean": 0.17667838633060456,
"signal/batch_coverage_25/group_std_mean": 0.2433231681585312,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.01766783855855465,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.01766783855855465,
"signal/batch_coverage_5/centered_abs_mean": 0.12719610780477525,
"signal/batch_coverage_5/group_std_mean": 0.1771041363477707,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012719610892236232,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012719610892236232,
"signal/brier_reward/centered_abs_mean": 0.12355803400278091,
"signal/brier_reward/group_std_mean": 0.1653554379940033,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.01235580388456583,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.01235580388456583,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08606803268194199,
"signal/confidence_uniqueness_reward/group_std_mean": 0.11045795083045959,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008606803603470325,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008606803603470325,
"signal/format_reward/centered_abs_mean": 0.011447482369840145,
"signal/format_reward/group_std_mean": 0.020899421907961367,
"signal/format_reward/group_zero_std_frac": 0.9138888955116272,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.005723741184920072,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.005723741184920072,
"signal/frontier_aurc_reward/centered_abs_mean": 0.00131355298217386,
"signal/frontier_aurc_reward/group_std_mean": 0.0020352060673758388,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.641941307752859e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.641941307752859e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.007170586753636599,
"signal/frontier_ece_reward/group_std_mean": 0.010662421584129333,
"signal/frontier_ece_reward/group_zero_std_frac": 0.03333333395421505,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0007170586846768856,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0007170586846768856,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.27514847218990324,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3442379176616669,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.1833333343267441,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.027514847368001936,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.027514847368001936,
"step": 195
},
{
"calibration/aurc": 0.12895234263738056,
"calibration/batch_distribution_entropy": 0.5441346575497024,
"calibration/buffer_distribution_entropy": 0.6665037511135818,
"calibration/confidence_entropy": 0.19706353954289205,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.05706806282722513,
"calibration/coverage@10%": 0.4662888173761813,
"calibration/coverage@15%": 0.6941641812131418,
"calibration/coverage@20%": 0.894394340529808,
"calibration/coverage@25%": 0.9372682155322863,
"calibration/coverage@30%": 0.9618128272251308,
"calibration/coverage@5%": 0.3992024414655271,
"calibration/ece": 0.12510555245208535,
"calibration/mean_confidence": 0.7964144439428475,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0065104166666666964,
"completions/max_length": 3574.4,
"completions/max_terminated_length": 3574.4,
"completions/mean_length": 991.6319702148437,
"completions/mean_terminated_length": 998.1380615234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.47999400007499904,
"grad_norm": 0.0006386162131093442,
"learning_rate": 2.409638554216868e-07,
"loss": -0.0062,
"num_tokens": 504315115.0,
"reward": 1.3395307540893555,
"reward_std": 0.1739016592502594,
"rewards/accuracy_reward": 0.6907986044883728,
"rewards/batch_coverage_0": 0.46083222031593324,
"rewards/batch_coverage_1": 0.46083222031593324,
"rewards/batch_coverage_10": 0.5888261795043945,
"rewards/batch_coverage_15": 0.6063881635665893,
"rewards/batch_coverage_20": 0.6214998483657836,
"rewards/batch_coverage_25": 0.6275609135627747,
"rewards/batch_coverage_5": 0.5429104447364808,
"rewards/brier_reward": 0.8572309732437133,
"rewards/confidence_uniqueness_reward": 0.816546094417572,
"rewards/format_reward": 0.9934027791023254,
"rewards/frontier_aurc_reward": -0.001357408077456057,
"rewards/frontier_ece_reward": 0.005987100955098868,
"rewards/frontier_entropy_batch_reward": -0.6141442060470581,
"signal/accuracy_reward/centered_abs_mean": 0.14255641996860505,
"signal/accuracy_reward/group_std_mean": 0.18642587959766388,
"signal/accuracy_reward/group_zero_std_frac": 0.475,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07127820998430252,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07127820998430252,
"signal/advantage_abs_mean": 0.12579307109117507,
"signal/advantage_pre_scale_abs_mean": 0.12579307109117507,
"signal/advantage_pre_scale_std": 0.2174198657274246,
"signal/advantage_std": 0.2174198657274246,
"signal/batch_coverage_0/centered_abs_mean": 0.11439124047756195,
"signal/batch_coverage_0/group_std_mean": 0.15456107556819915,
"signal/batch_coverage_0/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011439124494791031,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.011439124494791031,
"signal/batch_coverage_1/centered_abs_mean": 0.11439124047756195,
"signal/batch_coverage_1/group_std_mean": 0.15456107556819915,
"signal/batch_coverage_1/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011439124494791031,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.011439124494791031,
"signal/batch_coverage_10/centered_abs_mean": 0.13597213476896286,
"signal/batch_coverage_10/group_std_mean": 0.18966183364391326,
"signal/batch_coverage_10/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.01359721329063177,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.01359721329063177,
"signal/batch_coverage_15/centered_abs_mean": 0.14407963156700135,
"signal/batch_coverage_15/group_std_mean": 0.20045292973518372,
"signal/batch_coverage_15/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.014407963491976261,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.014407963491976261,
"signal/batch_coverage_20/centered_abs_mean": 0.14974366426467894,
"signal/batch_coverage_20/group_std_mean": 0.20867671072483063,
"signal/batch_coverage_20/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.014974366687238217,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.014974366687238217,
"signal/batch_coverage_25/centered_abs_mean": 0.15635756254196168,
"signal/batch_coverage_25/group_std_mean": 0.21556124687194825,
"signal/batch_coverage_25/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015635756216943263,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015635756216943263,
"signal/batch_coverage_5/centered_abs_mean": 0.12410448342561722,
"signal/batch_coverage_5/group_std_mean": 0.17132916748523713,
"signal/batch_coverage_5/group_zero_std_frac": 0.002777777798473835,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012410448491573333,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012410448491573333,
"signal/brier_reward/centered_abs_mean": 0.11009028404951096,
"signal/brier_reward/group_std_mean": 0.14840916395187378,
"signal/brier_reward/group_zero_std_frac": 0.002777777798473835,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011009028740227222,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011009028740227222,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.08328192979097367,
"signal/confidence_uniqueness_reward/group_std_mean": 0.11021423190832139,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.002777777798473835,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.008328192867338657,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008328192867338657,
"signal/format_reward/centered_abs_mean": 0.012076823133975267,
"signal/format_reward/group_std_mean": 0.026156437024474143,
"signal/format_reward/group_zero_std_frac": 0.8805555582046509,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0060384115669876335,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0060384115669876335,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0012080914457328618,
"signal/frontier_aurc_reward/group_std_mean": 0.001900116284377873,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.5101143071660772e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.5101143071660772e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.006761191785335541,
"signal/frontier_ece_reward/group_std_mean": 0.009993968904018402,
"signal/frontier_ece_reward/group_zero_std_frac": 0.03333333395421505,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0006761191645637155,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0006761191645637155,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.25325422883033755,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3234684646129608,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.1944444477558136,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.0253254234790802,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0253254234790802,
"step": 200
},
{
"epoch": 0.47999400007499904,
"eval_calibration/aurc": 0.13414728793506886,
"eval_calibration/batch_distribution_entropy": 0.5747845907279023,
"eval_calibration/buffer_distribution_entropy": 0.66335620413197,
"eval_calibration/confidence_entropy": 0.20467980469134264,
"eval_calibration/coverage@0%": 0.17825940860215053,
"eval_calibration/coverage@1%": 0.17825940860215053,
"eval_calibration/coverage@10%": 0.3970094086021505,
"eval_calibration/coverage@15%": 0.76377688172043,
"eval_calibration/coverage@20%": 0.8629032258064516,
"eval_calibration/coverage@25%": 0.9102822580645161,
"eval_calibration/coverage@30%": 0.9469086021505376,
"eval_calibration/coverage@5%": 0.21471774193548387,
"eval_calibration/ece": 0.18191234422153416,
"eval_calibration/mean_confidence": 0.7088780976480996,
"eval_completions/clipped_ratio": 0.002604166666666685,
"eval_completions/max_length": 2817.1666666666665,
"eval_completions/max_terminated_length": 2817.1666666666665,
"eval_completions/mean_length": 1001.8477579752604,
"eval_completions/mean_terminated_length": 1004.473866780599,
"eval_completions/min_length": 224.83333333333334,
"eval_completions/min_terminated_length": 424.8333333333333,
"eval_loss": 0.0,
"eval_num_tokens": 504315115.0,
"eval_reward": 1.0336094597975414,
"eval_reward_std": 0.2966693192720413,
"eval_rewards/accuracy_reward": 0.667534718910853,
"eval_rewards/batch_coverage_0": 0.11816969886422157,
"eval_rewards/batch_coverage_1": 0.11816969886422157,
"eval_rewards/batch_coverage_10": 0.15712855011224747,
"eval_rewards/batch_coverage_15": 0.20995599528153738,
"eval_rewards/batch_coverage_20": 0.2796262999375661,
"eval_rewards/batch_coverage_25": 0.3676889066894849,
"eval_rewards/batch_coverage_5": 0.12367384632428487,
"eval_rewards/brier_reward": 0.845408578713735,
"eval_rewards/confidence_uniqueness_reward": 0.7948392828305563,
"eval_rewards/format_reward": 0.9947916766007742,
"eval_rewards/frontier_aurc_reward": -0.0013247675669845194,
"eval_rewards/frontier_ece_reward": 0.0047586263390257955,
"eval_rewards/frontier_entropy_batch_reward": -0.9947916766007742,
"eval_runtime": 188.2209,
"eval_samples_per_second": 5.313,
"eval_signal/accuracy_reward/centered_abs_mean": 0.4306098024050395,
"eval_signal/accuracy_reward/group_std_mean": 0.47044746081034344,
"eval_signal/accuracy_reward/group_zero_std_frac": 0.0,
"eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21530490120251974,
"eval_signal/accuracy_reward/weight": 0.5,
"eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21530490120251974,
"eval_signal/advantage_abs_mean": 0.24172887206077576,
"eval_signal/advantage_pre_scale_abs_mean": 0.24172887206077576,
"eval_signal/advantage_pre_scale_std": 0.2949426124493281,
"eval_signal/advantage_std": 0.2949426124493281,
"eval_signal/batch_coverage_0/centered_abs_mean": 0.27022289981444675,
"eval_signal/batch_coverage_0/group_std_mean": 0.38285597662130993,
"eval_signal/batch_coverage_0/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.02702228942265113,
"eval_signal/batch_coverage_0/weight": 0.10000000149011612,
"eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.02702228942265113,
"eval_signal/batch_coverage_1/centered_abs_mean": 0.27022289981444675,
"eval_signal/batch_coverage_1/group_std_mean": 0.38285597662130993,
"eval_signal/batch_coverage_1/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.02702228942265113,
"eval_signal/batch_coverage_1/weight": 0.10000000149011612,
"eval_signal/batch_coverage_1/weighted_centered_abs_mean": 0.02702228942265113,
"eval_signal/batch_coverage_10/centered_abs_mean": 0.2386037011941274,
"eval_signal/batch_coverage_10/group_std_mean": 0.32710455854733783,
"eval_signal/batch_coverage_10/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.02386037011941274,
"eval_signal/batch_coverage_10/weight": 0.10000000149011612,
"eval_signal/batch_coverage_10/weighted_centered_abs_mean": 0.02386037011941274,
"eval_signal/batch_coverage_15/centered_abs_mean": 0.25227805972099304,
"eval_signal/batch_coverage_15/group_std_mean": 0.32569801310698193,
"eval_signal/batch_coverage_15/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.02522780777265628,
"eval_signal/batch_coverage_15/weight": 0.10000000149011612,
"eval_signal/batch_coverage_15/weighted_centered_abs_mean": 0.02522780777265628,
"eval_signal/batch_coverage_20/centered_abs_mean": 0.285963773727417,
"eval_signal/batch_coverage_20/group_std_mean": 0.3457394639650981,
"eval_signal/batch_coverage_20/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.02859637700021267,
"eval_signal/batch_coverage_20/weight": 0.10000000149011612,
"eval_signal/batch_coverage_20/weighted_centered_abs_mean": 0.02859637700021267,
"eval_signal/batch_coverage_25/centered_abs_mean": 0.39723547796408337,
"eval_signal/batch_coverage_25/group_std_mean": 0.4638222207625707,
"eval_signal/batch_coverage_25/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.03972354965905348,
"eval_signal/batch_coverage_25/weight": 0.10000000149011612,
"eval_signal/batch_coverage_25/weighted_centered_abs_mean": 0.03972354965905348,
"eval_signal/batch_coverage_5/centered_abs_mean": 0.2554217278957367,
"eval_signal/batch_coverage_5/group_std_mean": 0.36187784870465595,
"eval_signal/batch_coverage_5/group_zero_std_frac": 0.0,
"eval_signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.025542172603309155,
"eval_signal/batch_coverage_5/weight": 0.10000000149011612,
"eval_signal/batch_coverage_5/weighted_centered_abs_mean": 0.025542172603309155,
"eval_signal/brier_reward/centered_abs_mean": 0.22214544067780176,
"eval_signal/brier_reward/group_std_mean": 0.29984481632709503,
"eval_signal/brier_reward/group_zero_std_frac": 0.0,
"eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.022214544626573723,
"eval_signal/brier_reward/weight": 0.10000000149011612,
"eval_signal/brier_reward/weighted_centered_abs_mean": 0.022214544626573723,
"eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.1260842519501845,
"eval_signal/confidence_uniqueness_reward/group_std_mean": 0.14876357093453407,
"eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.012608425381282965,
"eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.012608425381282965,
"eval_signal/format_reward/centered_abs_mean": 0.009982638837148746,
"eval_signal/format_reward/group_std_mean": 0.026473373485108215,
"eval_signal/format_reward/group_zero_std_frac": 0.8611111342906952,
"eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.004991319418574373,
"eval_signal/format_reward/weight": 0.5,
"eval_signal/format_reward/weighted_centered_abs_mean": 0.004991319418574373,
"eval_signal/frontier_aurc_reward/centered_abs_mean": 0.0020859644864685833,
"eval_signal/frontier_aurc_reward/group_std_mean": 0.004563994938507676,
"eval_signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 2.607455629307272e-05,
"eval_signal/frontier_aurc_reward/weight": 0.012500000186264515,
"eval_signal/frontier_aurc_reward/weighted_centered_abs_mean": 2.607455629307272e-05,
"eval_signal/frontier_ece_reward/centered_abs_mean": 0.008840923585618535,
"eval_signal/frontier_ece_reward/group_std_mean": 0.014323081355541945,
"eval_signal/frontier_ece_reward/group_zero_std_frac": 0.0,
"eval_signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0008840923255775124,
"eval_signal/frontier_ece_reward/weight": 0.10000000149011612,
"eval_signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0008840923255775124,
"eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.009982638837148746,
"eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.026473373485108215,
"eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.8611111342906952,
"eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.0009982639652055998,
"eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0009982639652055998,
"eval_steps_per_second": 0.032,
"step": 200
},
{
"calibration/aurc": 0.1384997430281988,
"calibration/batch_distribution_entropy": 0.5506939823270676,
"calibration/buffer_distribution_entropy": 0.6636535644652605,
"calibration/confidence_entropy": 0.18998257839477467,
"calibration/coverage@0%": 0.0,
"calibration/coverage@1%": 0.05301837270341207,
"calibration/coverage@10%": 0.402384636060499,
"calibration/coverage@15%": 0.6366007892223926,
"calibration/coverage@20%": 0.8371666648008708,
"calibration/coverage@25%": 0.8847188877100887,
"calibration/coverage@30%": 0.923436125769569,
"calibration/coverage@5%": 0.26199896856972693,
"calibration/ece": 0.13033047827590277,
"calibration/mean_confidence": 0.7532977819177902,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007204861111111094,
"completions/max_length": 3430.6,
"completions/max_terminated_length": 3430.6,
"completions/mean_length": 1005.3292724609375,
"completions/mean_terminated_length": 1012.6463989257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.491993850076874,
"grad_norm": 0.0005171684897504747,
"learning_rate": 9.036144578313253e-08,
"loss": -0.0062,
"num_tokens": 518962460.0,
"reward": 1.3564418792724608,
"reward_std": 0.17012038230895996,
"rewards/accuracy_reward": 0.7057291746139527,
"rewards/batch_coverage_0": 0.4720217764377594,
"rewards/batch_coverage_1": 0.4720217764377594,
"rewards/batch_coverage_10": 0.6029796481132508,
"rewards/batch_coverage_15": 0.6217110633850098,
"rewards/batch_coverage_20": 0.637014639377594,
"rewards/batch_coverage_25": 0.6375706434249878,
"rewards/batch_coverage_5": 0.5604067802429199,
"rewards/brier_reward": 0.8739692211151123,
"rewards/confidence_uniqueness_reward": 0.817440927028656,
"rewards/format_reward": 0.9927951335906983,
"rewards/frontier_aurc_reward": -0.000946720875799656,
"rewards/frontier_ece_reward": 0.005940455570816993,
"rewards/frontier_entropy_batch_reward": -0.6291620016098023,
"signal/accuracy_reward/centered_abs_mean": 0.1403971344232559,
"signal/accuracy_reward/group_std_mean": 0.18923761546611786,
"signal/accuracy_reward/group_zero_std_frac": 0.44444444179534914,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07019856721162795,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07019856721162795,
"signal/advantage_abs_mean": 0.11906714290380478,
"signal/advantage_pre_scale_abs_mean": 0.11906714290380478,
"signal/advantage_pre_scale_std": 0.21315287351608275,
"signal/advantage_std": 0.21315287351608275,
"signal/batch_coverage_0/centered_abs_mean": 0.11111615747213363,
"signal/batch_coverage_0/group_std_mean": 0.1536956250667572,
"signal/batch_coverage_0/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.011111615598201752,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.011111615598201752,
"signal/batch_coverage_1/centered_abs_mean": 0.11111615747213363,
"signal/batch_coverage_1/group_std_mean": 0.1536956250667572,
"signal/batch_coverage_1/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.011111615598201752,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.011111615598201752,
"signal/batch_coverage_10/centered_abs_mean": 0.12938762009143828,
"signal/batch_coverage_10/group_std_mean": 0.18510680198669432,
"signal/batch_coverage_10/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.012938761338591575,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.012938761338591575,
"signal/batch_coverage_15/centered_abs_mean": 0.1383065789937973,
"signal/batch_coverage_15/group_std_mean": 0.19660945236682892,
"signal/batch_coverage_15/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.013830658420920372,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.013830658420920372,
"signal/batch_coverage_20/centered_abs_mean": 0.1493982344865799,
"signal/batch_coverage_20/group_std_mean": 0.21061743199825286,
"signal/batch_coverage_20/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.014939823932945729,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.014939823932945729,
"signal/batch_coverage_25/centered_abs_mean": 0.15058391988277436,
"signal/batch_coverage_25/group_std_mean": 0.2111032247543335,
"signal/batch_coverage_25/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015058392658829689,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015058392658829689,
"signal/batch_coverage_5/centered_abs_mean": 0.12164954096078873,
"signal/batch_coverage_5/group_std_mean": 0.1719556510448456,
"signal/batch_coverage_5/group_zero_std_frac": 0.008333333395421505,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012164954654872417,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012164954654872417,
"signal/brier_reward/centered_abs_mean": 0.10298076122999192,
"signal/brier_reward/group_std_mean": 0.14395154118537903,
"signal/brier_reward/group_zero_std_frac": 0.008333333395421505,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.010298076085746288,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.010298076085746288,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07968108505010604,
"signal/confidence_uniqueness_reward/group_std_mean": 0.10596004277467727,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.008333333395421505,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.007968108542263508,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007968108542263508,
"signal/format_reward/centered_abs_mean": 0.012679036241024733,
"signal/format_reward/group_std_mean": 0.02613435760140419,
"signal/format_reward/group_zero_std_frac": 0.8833333373069763,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.006339518120512366,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.006339518120512366,
"signal/frontier_aurc_reward/centered_abs_mean": 0.0008092402247712016,
"signal/frontier_aurc_reward/group_std_mean": 0.001309644477441907,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.0115502846019808e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.0115502846019808e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.006613474618643522,
"signal/frontier_ece_reward/group_std_mean": 0.009865978732705116,
"signal/frontier_ece_reward/group_zero_std_frac": 0.033333334140479565,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0006613474688492715,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0006613474688492715,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.26013609766960144,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.3299739480018616,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.18888888955116273,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.026013610139489173,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.026013610139489173,
"step": 205
},
{
"calibration/aurc": 0.07137097713366734,
"calibration/batch_distribution_entropy": 0.5610990286976337,
"calibration/buffer_distribution_entropy": 0.6644328642125057,
"calibration/confidence_entropy": 0.20935073579914096,
"calibration/coverage@0%": 0.06613756613756613,
"calibration/coverage@1%": 0.10846560846560847,
"calibration/coverage@10%": 0.7219348181390065,
"calibration/coverage@15%": 0.8911419522239766,
"calibration/coverage@20%": 0.9341302159801287,
"calibration/coverage@25%": 0.9753640451720731,
"calibration/coverage@30%": 1.0,
"calibration/coverage@5%": 0.5699630558787409,
"calibration/ece": 0.10638484090913493,
"calibration/mean_confidence": 0.7933820565499561,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00708912037037035,
"completions/max_length": 3432.0,
"completions/max_terminated_length": 3432.0,
"completions/mean_length": 1000.5524088541666,
"completions/mean_terminated_length": 1007.8630981445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.49919376007799904,
"num_tokens": 527744326.0,
"reward": 1.3347638448079426,
"reward_std": 0.17598708967367807,
"rewards/accuracy_reward": 0.6691261529922485,
"rewards/batch_coverage_0": 0.45231754581133526,
"rewards/batch_coverage_1": 0.45231754581133526,
"rewards/batch_coverage_10": 0.5946024060249329,
"rewards/batch_coverage_15": 0.6113624970118204,
"rewards/batch_coverage_20": 0.6223725279172262,
"rewards/batch_coverage_25": 0.6295110781987509,
"rewards/batch_coverage_5": 0.5561538636684418,
"rewards/brier_reward": 0.8604856729507446,
"rewards/confidence_uniqueness_reward": 0.8426164587338766,
"rewards/format_reward": 0.9929108818372091,
"rewards/frontier_aurc_reward": -0.0009377729729749262,
"rewards/frontier_ece_reward": 0.005627031127611796,
"rewards/frontier_entropy_batch_reward": -0.5897964437802633,
"signal/accuracy_reward/centered_abs_mean": 0.14289460331201553,
"signal/accuracy_reward/group_std_mean": 0.19061768054962158,
"signal/accuracy_reward/group_zero_std_frac": 0.4444444477558136,
"signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07144730165600777,
"signal/accuracy_reward/weight": 0.5,
"signal/accuracy_reward/weighted_centered_abs_mean": 0.07144730165600777,
"signal/advantage_abs_mean": 0.12517134100198746,
"signal/advantage_pre_scale_abs_mean": 0.12517134100198746,
"signal/advantage_pre_scale_std": 0.2169480174779892,
"signal/advantage_std": 0.2169480174779892,
"signal/batch_coverage_0/centered_abs_mean": 0.11347770194212596,
"signal/batch_coverage_0/group_std_mean": 0.15858340760072073,
"signal/batch_coverage_0/group_zero_std_frac": 0.0,
"signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.01134777038047711,
"signal/batch_coverage_0/weight": 0.10000000149011612,
"signal/batch_coverage_0/weighted_centered_abs_mean": 0.01134777038047711,
"signal/batch_coverage_1/centered_abs_mean": 0.11347770194212596,
"signal/batch_coverage_1/group_std_mean": 0.15858340760072073,
"signal/batch_coverage_1/group_zero_std_frac": 0.0,
"signal/batch_coverage_1/scaled_weighted_centered_abs_mean": 0.01134777038047711,
"signal/batch_coverage_1/weight": 0.10000000149011612,
"signal/batch_coverage_1/weighted_centered_abs_mean": 0.01134777038047711,
"signal/batch_coverage_10/centered_abs_mean": 0.13170498609542847,
"signal/batch_coverage_10/group_std_mean": 0.19124586880207062,
"signal/batch_coverage_10/group_zero_std_frac": 0.0,
"signal/batch_coverage_10/scaled_weighted_centered_abs_mean": 0.013170498423278332,
"signal/batch_coverage_10/weight": 0.10000000149011612,
"signal/batch_coverage_10/weighted_centered_abs_mean": 0.013170498423278332,
"signal/batch_coverage_15/centered_abs_mean": 0.13884608447551727,
"signal/batch_coverage_15/group_std_mean": 0.20076180001099905,
"signal/batch_coverage_15/group_zero_std_frac": 0.0,
"signal/batch_coverage_15/scaled_weighted_centered_abs_mean": 0.013884608633816242,
"signal/batch_coverage_15/weight": 0.10000000149011612,
"signal/batch_coverage_15/weighted_centered_abs_mean": 0.013884608633816242,
"signal/batch_coverage_20/centered_abs_mean": 0.14567939937114716,
"signal/batch_coverage_20/group_std_mean": 0.2094889134168625,
"signal/batch_coverage_20/group_zero_std_frac": 0.0,
"signal/batch_coverage_20/scaled_weighted_centered_abs_mean": 0.01456794049590826,
"signal/batch_coverage_20/weight": 0.10000000149011612,
"signal/batch_coverage_20/weighted_centered_abs_mean": 0.01456794049590826,
"signal/batch_coverage_25/centered_abs_mean": 0.15283007423082987,
"signal/batch_coverage_25/group_std_mean": 0.21731575826803842,
"signal/batch_coverage_25/group_zero_std_frac": 0.0,
"signal/batch_coverage_25/scaled_weighted_centered_abs_mean": 0.015283007795612017,
"signal/batch_coverage_25/weight": 0.10000000149011612,
"signal/batch_coverage_25/weighted_centered_abs_mean": 0.015283007795612017,
"signal/batch_coverage_5/centered_abs_mean": 0.1270467018087705,
"signal/batch_coverage_5/group_std_mean": 0.18080047766367593,
"signal/batch_coverage_5/group_zero_std_frac": 0.0,
"signal/batch_coverage_5/scaled_weighted_centered_abs_mean": 0.012704670739670595,
"signal/batch_coverage_5/weight": 0.10000000149011612,
"signal/batch_coverage_5/weighted_centered_abs_mean": 0.012704670739670595,
"signal/brier_reward/centered_abs_mean": 0.11329849809408188,
"signal/brier_reward/group_std_mean": 0.15196349223454794,
"signal/brier_reward/group_zero_std_frac": 0.0,
"signal/brier_reward/scaled_weighted_centered_abs_mean": 0.011329849561055502,
"signal/brier_reward/weight": 0.10000000149011612,
"signal/brier_reward/weighted_centered_abs_mean": 0.011329849561055502,
"signal/confidence_uniqueness_reward/centered_abs_mean": 0.07130381713310878,
"signal/confidence_uniqueness_reward/group_std_mean": 0.09505186229944229,
"signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0,
"signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0071303822721044225,
"signal/confidence_uniqueness_reward/weight": 0.10000000149011612,
"signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0071303822721044225,
"signal/format_reward/centered_abs_mean": 0.012776693018774191,
"signal/format_reward/group_std_mean": 0.027781900639335316,
"signal/format_reward/group_zero_std_frac": 0.8703703681627909,
"signal/format_reward/scaled_weighted_centered_abs_mean": 0.0063883465093870955,
"signal/format_reward/weight": 0.5,
"signal/format_reward/weighted_centered_abs_mean": 0.0063883465093870955,
"signal/frontier_aurc_reward/centered_abs_mean": 0.000829536720023801,
"signal/frontier_aurc_reward/group_std_mean": 0.0013354160667707522,
"signal/frontier_aurc_reward/group_zero_std_frac": 0.0,
"signal/frontier_aurc_reward/scaled_weighted_centered_abs_mean": 1.0369208666816121e-05,
"signal/frontier_aurc_reward/weight": 0.012500000186264515,
"signal/frontier_aurc_reward/weighted_centered_abs_mean": 1.0369208666816121e-05,
"signal/frontier_ece_reward/centered_abs_mean": 0.007324707694351673,
"signal/frontier_ece_reward/group_std_mean": 0.010262228238085905,
"signal/frontier_ece_reward/group_zero_std_frac": 0.013888888992369175,
"signal/frontier_ece_reward/scaled_weighted_centered_abs_mean": 0.0007324707500326136,
"signal/frontier_ece_reward/weight": 0.10000000149011612,
"signal/frontier_ece_reward/weighted_centered_abs_mean": 0.0007324707500326136,
"signal/frontier_entropy_batch_reward/centered_abs_mean": 0.26471273104349774,
"signal/frontier_entropy_batch_reward/group_std_mean": 0.33471423387527466,
"signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.16666666666666666,
"signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.02647127149005731,
"signal/frontier_entropy_batch_reward/weight": 0.10000000149011612,
"signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.02647127149005731,
"step": 208,
"total_flos": 0.0,
"train_loss": -0.008862279910737505,
"train_runtime": 43390.7211,
"train_samples_per_second": 0.346,
"train_steps_per_second": 0.005
}
],
"logging_steps": 5,
"max_steps": 208,
"num_input_tokens_seen": 527744326,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}