{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.499193760077999, "eval_steps": 50, "global_step": 1040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calibration/aurc": 0.5003566961335364, "calibration/batch_distribution_entropy": 0.25624920600381274, "calibration/confidence_entropy": 0.21391736590893057, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/ece": 0.46778471737286403, "calibration/mean_confidence": 0.9186445387209415, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01961805555555558, "completions/max_length": 4010.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 514.2163208007812, "completions/mean_terminated_length": 524.513525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.011999850001874977, "grad_norm": 0.005415355321019888, "learning_rate": 1.201923076923077e-07, "loss": 0.0045, "num_tokens": 9037980.0, "reward": 0.5837182879447937, "reward_std": 0.5405927896499634, "rewards/accuracy_reward": 0.2587673544883728, "rewards/brier_reward": 0.30907942056655885, "rewards/confidence_one_or_zero": 0.35460069179534914, "rewards/format_reward": 0.5995659589767456, "rewards/mean_confidence_reward": 0.835715651512146, "signal/accuracy_reward/centered_abs_mean": 0.3080674946308136, "signal/accuracy_reward/group_std_mean": 0.3671301305294037, "signal/accuracy_reward/group_zero_std_frac": 0.09444444552063942, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1540337473154068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1540337473154068, "signal/advantage_abs_mean": 0.46336851119995115, "signal/advantage_pre_scale_abs_mean": 0.46336851119995115, "signal/advantage_pre_scale_std": 0.5488831758499145, "signal/advantage_std": 0.5488831758499145, "signal/brier_reward/centered_abs_mean": 0.3180743157863617, "signal/brier_reward/group_std_mean": 0.37152485847473143, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15903715789318085, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.15903715789318085, "signal/confidence_one_or_zero/centered_abs_mean": 0.414599609375, "signal/confidence_one_or_zero/group_std_mean": 0.45943390727043154, "signal/confidence_one_or_zero/group_zero_std_frac": 0.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.145995808357838e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.145995808357838e-06, "signal/format_reward/centered_abs_mean": 0.4385796427726746, "signal/format_reward/group_std_mean": 0.4739511787891388, "signal/format_reward/group_zero_std_frac": 0.0, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.2192898213863373, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.2192898213863373, "signal/mean_confidence_reward/centered_abs_mean": 0.16667295396327972, "signal/mean_confidence_reward/group_std_mean": 0.24374662935733796, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.6667295085426304e-06, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 1.6667295085426304e-06, "step": 5 }, { "calibration/aurc": 0.4675468398107022, "calibration/batch_distribution_entropy": 0.2699707591373183, "calibration/confidence_entropy": 0.21517325158335643, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/ece": 0.44461420729414314, "calibration/mean_confidence": 0.9199737089100205, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018142361111111095, "completions/max_length": 4011.4, "completions/max_terminated_length": 4011.4, "completions/mean_length": 499.46025390625, "completions/mean_terminated_length": 508.78004150390626, "completions/min_length": 0.0, "completions/min_terminated_length": 2.4, "epoch": 0.023999700003749954, "grad_norm": 0.005066287238150835, "learning_rate": 2.403846153846154e-07, "loss": 0.0031, "num_tokens": 17874482.0, "reward": 0.6011599421501159, "reward_std": 0.5348963379859925, "rewards/accuracy_reward": 0.26406250298023226, "rewards/brier_reward": 0.31731386184692384, "rewards/confidence_one_or_zero": 0.33776041865348816, "rewards/format_reward": 0.6209201335906982, "rewards/mean_confidence_reward": 0.831798505783081, "signal/accuracy_reward/centered_abs_mean": 0.3103407084941864, "signal/accuracy_reward/group_std_mean": 0.3712883353233337, "signal/accuracy_reward/group_zero_std_frac": 0.08611111268401146, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1551703542470932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1551703542470932, "signal/advantage_abs_mean": 0.4525578796863556, "signal/advantage_pre_scale_abs_mean": 0.4525578796863556, "signal/advantage_pre_scale_std": 0.5419019460678101, "signal/advantage_std": 0.5419019460678101, "signal/brier_reward/centered_abs_mean": 0.31428545117378237, "signal/brier_reward/group_std_mean": 0.3695895433425903, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15714272558689119, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.15714272558689119, "signal/confidence_one_or_zero/centered_abs_mean": 0.40955403447151184, "signal/confidence_one_or_zero/group_std_mean": 0.45571841597557067, "signal/confidence_one_or_zero/group_zero_std_frac": 0.002777777798473835, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.095540316484403e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.095540316484403e-06, "signal/format_reward/centered_abs_mean": 0.42283528447151186, "signal/format_reward/group_std_mean": 0.46479352116584777, "signal/format_reward/group_zero_std_frac": 0.0, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.21141764223575593, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.21141764223575593, "signal/mean_confidence_reward/centered_abs_mean": 0.16369045972824098, "signal/mean_confidence_reward/group_std_mean": 0.238270166516304, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.6369045169994934e-06, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 1.6369045169994934e-06, "step": 10 }, { "calibration/aurc": 0.5186979879013126, "calibration/batch_distribution_entropy": 0.2540334455268813, "calibration/confidence_entropy": 0.22228421544234456, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/ece": 0.5040791119398614, "calibration/mean_confidence": 0.9189297206705145, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.020138888888888862, "completions/max_length": 4040.4, "completions/max_terminated_length": 4040.4, "completions/mean_length": 506.28428344726564, "completions/mean_terminated_length": 516.7885192871094, "completions/min_length": 0.0, "completions/min_terminated_length": 6.8, "epoch": 0.03599955000562493, "grad_norm": 0.0038907676935195923, "learning_rate": 3.6057692307692306e-07, "loss": 0.0048, "num_tokens": 26808861.0, "reward": 0.5788660883903504, "reward_std": 0.5162944972515107, "rewards/accuracy_reward": 0.23628472089767455, "rewards/brier_reward": 0.2952092230319977, "rewards/confidence_one_or_zero": 0.3163194417953491, "rewards/format_reward": 0.6262152791023254, "rewards/mean_confidence_reward": 0.8313595175743103, "signal/accuracy_reward/centered_abs_mean": 0.2911892354488373, "signal/accuracy_reward/group_std_mean": 0.35461498498916627, "signal/accuracy_reward/group_zero_std_frac": 0.10555555820465087, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.14559461772441865, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.14559461772441865, "signal/advantage_abs_mean": 0.43306326270103457, "signal/advantage_pre_scale_abs_mean": 0.43306326270103457, "signal/advantage_pre_scale_std": 0.5246892929077148, "signal/advantage_std": 0.5246892929077148, "signal/brier_reward/centered_abs_mean": 0.2995938897132874, "signal/brier_reward/group_std_mean": 0.3567161738872528, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1497969448566437, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.1497969448566437, "signal/confidence_one_or_zero/centered_abs_mean": 0.39108072519302367, "signal/confidence_one_or_zero/group_std_mean": 0.4443080723285675, "signal/confidence_one_or_zero/group_zero_std_frac": 0.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.9108071632654175e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.9108071632654175e-06, "signal/format_reward/centered_abs_mean": 0.41809895634651184, "signal/format_reward/group_std_mean": 0.46151856780052186, "signal/format_reward/group_zero_std_frac": 0.0, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.20904947817325592, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.20904947817325592, "signal/mean_confidence_reward/centered_abs_mean": 0.16398017406463622, "signal/mean_confidence_reward/group_std_mean": 0.23571294844150542, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.639801712371991e-06, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 1.639801712371991e-06, "step": 15 }, { "calibration/aurc": 0.5209640811120212, "calibration/batch_distribution_entropy": 0.2477626445824733, "calibration/confidence_entropy": 0.22264009410365607, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/ece": 0.4889315116331372, "calibration/mean_confidence": 0.9216075469808418, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01987847222222223, "completions/max_length": 3925.8, "completions/max_terminated_length": 3925.8, "completions/mean_length": 484.0993896484375, "completions/mean_terminated_length": 494.1028747558594, "completions/min_length": 0.0, "completions/min_terminated_length": 20.8, "epoch": 0.04799940000749991, "grad_norm": 0.013253336772322655, "learning_rate": 4.807692307692308e-07, "loss": 0.0041, "num_tokens": 35499382.0, "reward": 0.6855281472206116, "reward_std": 0.49826231598854065, "rewards/accuracy_reward": 0.28315972089767455, "rewards/brier_reward": 0.3515884280204773, "rewards/confidence_one_or_zero": 0.3247395813465118, "rewards/format_reward": 0.7362847328186035, "rewards/mean_confidence_reward": 0.8484625816345215, "signal/accuracy_reward/centered_abs_mean": 0.3103190064430237, "signal/accuracy_reward/group_std_mean": 0.36804171204566954, "signal/accuracy_reward/group_zero_std_frac": 0.10555555894970894, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.15515950322151184, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.15515950322151184, "signal/advantage_abs_mean": 0.41278480887413027, "signal/advantage_pre_scale_abs_mean": 0.41278480887413027, "signal/advantage_pre_scale_std": 0.5071763217449188, "signal/advantage_std": 0.5071763217449188, "signal/brier_reward/centered_abs_mean": 0.31032766699790953, "signal/brier_reward/group_std_mean": 0.36342496871948243, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15516383349895477, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.15516383349895477, "signal/confidence_one_or_zero/centered_abs_mean": 0.3996690511703491, "signal/confidence_one_or_zero/group_std_mean": 0.45010300278663634, "signal/confidence_one_or_zero/group_zero_std_frac": 0.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.996690429630689e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.996690429630689e-06, "signal/format_reward/centered_abs_mean": 0.34467231035232543, "signal/format_reward/group_std_mean": 0.41416847705841064, "signal/format_reward/group_zero_std_frac": 0.00555555559694767, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.17233615517616271, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.17233615517616271, "signal/mean_confidence_reward/centered_abs_mean": 0.14559868574142457, "signal/mean_confidence_reward/group_std_mean": 0.21428031325340272, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.4559868304786505e-06, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 1.4559868304786505e-06, "step": 20 }, { "calibration/aurc": 0.5226141912501454, "calibration/batch_distribution_entropy": 0.26392980975536445, "calibration/confidence_entropy": 0.22660199716355445, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/ece": 0.49068137185722993, "calibration/mean_confidence": 0.9186960370091383, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015017361111111138, "completions/max_length": 3907.6, "completions/max_terminated_length": 3907.6, "completions/mean_length": 438.3477416992188, "completions/mean_terminated_length": 445.0595458984375, "completions/min_length": 0.0, "completions/min_terminated_length": 29.2, "epoch": 0.05999925000937488, "grad_norm": 0.002081266837194562, "learning_rate": 6.009615384615385e-07, "loss": -0.0057, "num_tokens": 43673596.0, "reward": 0.7870225310325623, "reward_std": 0.4548142373561859, "rewards/accuracy_reward": 0.3155381977558136, "rewards/brier_reward": 0.40136520862579345, "rewards/confidence_one_or_zero": 0.3147569477558136, "rewards/format_reward": 0.8571180582046509, "rewards/mean_confidence_reward": 0.8663933753967286, "signal/accuracy_reward/centered_abs_mean": 0.3275119364261627, "signal/accuracy_reward/group_std_mean": 0.38190675973892213, "signal/accuracy_reward/group_zero_std_frac": 0.0916666690260172, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.16375596821308136, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.16375596821308136, "signal/advantage_abs_mean": 0.37549259066581725, "signal/advantage_pre_scale_abs_mean": 0.37549259066581725, "signal/advantage_pre_scale_std": 0.4622957110404968, "signal/advantage_std": 0.4622957110404968, "signal/brier_reward/centered_abs_mean": 0.3130465567111969, "signal/brier_reward/group_std_mean": 0.36286460161209105, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15652327835559846, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.15652327835559846, "signal/confidence_one_or_zero/centered_abs_mean": 0.3908528685569763, "signal/confidence_one_or_zero/group_std_mean": 0.443654865026474, "signal/confidence_one_or_zero/group_zero_std_frac": 0.002777777798473835, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.90852846976486e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.90852846976486e-06, "signal/format_reward/centered_abs_mean": 0.21771918535232543, "signal/format_reward/group_std_mean": 0.3091414660215378, "signal/format_reward/group_zero_std_frac": 0.07500000130385161, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.10885959267616271, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.10885959267616271, "signal/mean_confidence_reward/centered_abs_mean": 0.12545316964387893, "signal/mean_confidence_reward/group_std_mean": 0.18768543004989624, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.2545316621981328e-06, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 1.2545316621981328e-06, "step": 25 }, { "calibration/aurc": 0.5141087220689148, "calibration/batch_distribution_entropy": 0.32289542645647246, "calibration/confidence_entropy": 0.2506993069575317, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/ece": 0.46583421801217406, "calibration/mean_confidence": 0.9045510415868986, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012239583333333326, "completions/max_length": 3929.2, "completions/max_terminated_length": 3929.2, "completions/mean_length": 423.99583740234374, "completions/mean_terminated_length": 429.28956298828126, "completions/min_length": 0.0, "completions/min_terminated_length": 60.8, "epoch": 0.07199910001124986, "grad_norm": 0.0031626673880964518, "learning_rate": 7.211538461538461e-07, "loss": -0.0127, "num_tokens": 51667948.0, "reward": 0.8738817811012268, "reward_std": 0.4030775785446167, "rewards/accuracy_reward": 0.35173611640930175, "rewards/brier_reward": 0.4531223654747009, "rewards/confidence_one_or_zero": 0.27413194477558134, "rewards/format_reward": 0.9428819417953491, "rewards/mean_confidence_reward": 0.8835186243057251, "signal/accuracy_reward/centered_abs_mean": 0.32495659589767456, "signal/accuracy_reward/group_std_mean": 0.3823892056941986, "signal/accuracy_reward/group_zero_std_frac": 0.08611111417412758, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.16247829794883728, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.16247829794883728, "signal/advantage_abs_mean": 0.33052992820739746, "signal/advantage_pre_scale_abs_mean": 0.33052992820739746, "signal/advantage_pre_scale_std": 0.4124049127101898, "signal/advantage_std": 0.4124049127101898, "signal/brier_reward/centered_abs_mean": 0.29936038255691527, "signal/brier_reward/group_std_mean": 0.3517481565475464, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.14968019127845764, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.14968019127845764, "signal/confidence_one_or_zero/centered_abs_mean": 0.3557074725627899, "signal/confidence_one_or_zero/group_std_mean": 0.4193476617336273, "signal/confidence_one_or_zero/group_zero_std_frac": 0.008333333395421505, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.557074660420767e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.557074660420767e-06, "signal/format_reward/centered_abs_mean": 0.09854600578546524, "signal/format_reward/group_std_mean": 0.18272685110569, "signal/format_reward/group_zero_std_frac": 0.27777778208255766, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.04927300289273262, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.04927300289273262, "signal/mean_confidence_reward/centered_abs_mean": 0.09601933360099793, "signal/mean_confidence_reward/group_std_mean": 0.1503779798746109, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 9.601933356861992e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 9.601933356861992e-07, "step": 30 }, { "calibration/aurc": 0.4972522109347592, "calibration/batch_distribution_entropy": 0.3302711933746244, "calibration/confidence_entropy": 0.2776537527426556, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/ece": 0.41930328878727413, "calibration/mean_confidence": 0.8953105409806161, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012586805555555558, "completions/max_length": 3699.4, "completions/max_terminated_length": 3699.4, "completions/mean_length": 448.37916870117186, "completions/mean_terminated_length": 454.13587646484376, "completions/min_length": 0.0, "completions/min_terminated_length": 72.8, "epoch": 0.08399895001312484, "grad_norm": 0.001993843587115407, "learning_rate": 8.41346153846154e-07, "loss": -0.0133, "num_tokens": 59910716.0, "reward": 0.9582816243171692, "reward_std": 0.37053838968276975, "rewards/accuracy_reward": 0.41666666269302366, "rewards/brier_reward": 0.5245276629924774, "rewards/confidence_one_or_zero": 0.20017361342906953, "rewards/format_reward": 0.9753472089767456, "rewards/mean_confidence_reward": 0.8809749722480774, "signal/accuracy_reward/centered_abs_mean": 0.3172743022441864, "signal/accuracy_reward/group_std_mean": 0.3770745456218719, "signal/accuracy_reward/group_zero_std_frac": 0.08055555820465088, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1586371511220932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1586371511220932, "signal/advantage_abs_mean": 0.3055282711982727, "signal/advantage_pre_scale_abs_mean": 0.3055282711982727, "signal/advantage_pre_scale_std": 0.3821408569812775, "signal/advantage_std": 0.3821408569812775, "signal/brier_reward/centered_abs_mean": 0.2819680333137512, "signal/brier_reward/group_std_mean": 0.3350661635398865, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1409840166568756, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.1409840166568756, "signal/confidence_one_or_zero/centered_abs_mean": 0.2884765684604645, "signal/confidence_one_or_zero/group_std_mean": 0.3684383273124695, "signal/confidence_one_or_zero/group_zero_std_frac": 0.0361111119389534, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 2.884765626731678e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 2.884765626731678e-06, "signal/format_reward/centered_abs_mean": 0.04397786483168602, "signal/format_reward/group_std_mean": 0.09554313570261001, "signal/format_reward/group_zero_std_frac": 0.5583333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.02198893241584301, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.02198893241584301, "signal/mean_confidence_reward/centered_abs_mean": 0.08512969464063644, "signal/mean_confidence_reward/group_std_mean": 0.12882178872823716, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.512969316143426e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.512969316143426e-07, "step": 35 }, { "calibration/aurc": 0.46223054679039804, "calibration/batch_distribution_entropy": 0.37872885716176546, "calibration/confidence_entropy": 0.3156751490975468, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0525130890052356, "calibration/coverage@30%": 0.0525130890052356, "calibration/coverage@5%": 0.0, "calibration/ece": 0.38400354955456084, "calibration/mean_confidence": 0.8831874564885155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010763888888888906, "completions/max_length": 3989.8, "completions/max_terminated_length": 3989.8, "completions/mean_length": 483.07899780273436, "completions/mean_terminated_length": 488.3682434082031, "completions/min_length": 0.0, "completions/min_terminated_length": 85.4, "epoch": 0.09599880001499982, "grad_norm": 0.001518145203590393, "learning_rate": 9.615384615384617e-07, "loss": -0.0075, "num_tokens": 68595306.0, "reward": 1.0371843338012696, "reward_std": 0.34998282194137575, "rewards/accuracy_reward": 0.4918402791023254, "rewards/brier_reward": 0.5990880489349365, "rewards/confidence_one_or_zero": 0.13880208432674407, "rewards/format_reward": 0.9834201335906982, "rewards/mean_confidence_reward": 0.8695887327194214, "signal/accuracy_reward/centered_abs_mean": 0.306391054391861, "signal/accuracy_reward/group_std_mean": 0.3689858078956604, "signal/accuracy_reward/group_zero_std_frac": 0.08888888992369175, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1531955271959305, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1531955271959305, "signal/advantage_abs_mean": 0.2849509596824646, "signal/advantage_pre_scale_abs_mean": 0.2849509596824646, "signal/advantage_pre_scale_std": 0.3639970779418945, "signal/advantage_std": 0.3639970779418945, "signal/brier_reward/centered_abs_mean": 0.2586053818464279, "signal/brier_reward/group_std_mean": 0.3124267339706421, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.12930269092321395, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.12930269092321395, "signal/confidence_one_or_zero/centered_abs_mean": 0.21510959267616273, "signal/confidence_one_or_zero/group_std_mean": 0.30543968081474304, "signal/confidence_one_or_zero/group_zero_std_frac": 0.0916666679084301, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 2.1510959186343826e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 2.1510959186343826e-06, "signal/format_reward/centered_abs_mean": 0.03066948838531971, "signal/format_reward/group_std_mean": 0.06838164255023002, "signal/format_reward/group_zero_std_frac": 0.6833333492279052, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.015334744192659854, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.015334744192659854, "signal/mean_confidence_reward/centered_abs_mean": 0.08165367096662521, "signal/mean_confidence_reward/group_std_mean": 0.12294813990592957, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.165366693901888e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.165366693901888e-07, "step": 40 }, { "calibration/aurc": 0.3632523722078428, "calibration/batch_distribution_entropy": 0.40014022467776555, "calibration/confidence_entropy": 0.33649726670302316, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.010869565217391304, "calibration/coverage@30%": 0.31278023181698555, "calibration/coverage@5%": 0.0, "calibration/ece": 0.28071032592236855, "calibration/mean_confidence": 0.8741442428679134, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4014.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 537.7995727539062, "completions/mean_terminated_length": 544.241943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 111.6, "epoch": 0.1079986500168748, "grad_norm": 0.0014352889265865088, "learning_rate": 1.0817307692307693e-06, "loss": -0.0081, "num_tokens": 77926021.0, "reward": 1.0762129545211792, "reward_std": 0.31938507556915285, "rewards/accuracy_reward": 0.5282118022441864, "rewards/brier_reward": 0.6402541995048523, "rewards/confidence_one_or_zero": 0.09305555522441863, "rewards/format_reward": 0.9839409708976745, "rewards/mean_confidence_reward": 0.85385502576828, "signal/accuracy_reward/centered_abs_mean": 0.27456054389476775, "signal/accuracy_reward/group_std_mean": 0.34297096729278564, "signal/accuracy_reward/group_zero_std_frac": 0.10277777835726738, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.13728027194738388, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.13728027194738388, "signal/advantage_abs_mean": 0.2513105541467667, "signal/advantage_pre_scale_abs_mean": 0.2513105541467667, "signal/advantage_pre_scale_std": 0.33808977007865904, "signal/advantage_std": 0.33808977007865904, "signal/brier_reward/centered_abs_mean": 0.22321872115135194, "signal/brier_reward/group_std_mean": 0.27992293834686277, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.11160936057567597, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.11160936057567597, "signal/confidence_one_or_zero/centered_abs_mean": 0.1535481780767441, "signal/confidence_one_or_zero/group_std_mean": 0.24207101464271547, "signal/confidence_one_or_zero/group_zero_std_frac": 0.1833333358168602, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.5354817151091993e-06, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.5354817151091993e-06, "signal/format_reward/centered_abs_mean": 0.02777235247194767, "signal/format_reward/group_std_mean": 0.05812191069126129, "signal/format_reward/group_zero_std_frac": 0.7361111164093017, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.013886176235973834, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.013886176235973834, "signal/mean_confidence_reward/centered_abs_mean": 0.0813577339053154, "signal/mean_confidence_reward/group_std_mean": 0.11790282279253006, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.135773327921925e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.135773327921925e-07, "step": 45 }, { "calibration/aurc": 0.4560983215034666, "calibration/batch_distribution_entropy": 0.46742343392517716, "calibration/confidence_entropy": 0.3861066400114651, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.00797872340425532, "calibration/coverage@25%": 0.036550151975683884, "calibration/coverage@30%": 0.05037993920972644, "calibration/coverage@5%": 0.0, "calibration/ece": 0.3575551210969322, "calibration/mean_confidence": 0.8537136628535116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013194444444444443, "completions/max_length": 4016.2, "completions/max_terminated_length": 4016.2, "completions/mean_length": 587.3158081054687, "completions/mean_terminated_length": 595.2116821289062, "completions/min_length": 0.0, "completions/min_terminated_length": 123.2, "epoch": 0.11999850001874976, "grad_norm": 0.0010600859532132745, "learning_rate": 1.201923076923077e-06, "loss": -0.005, "num_tokens": 87789499.0, "reward": 1.113412570953369, "reward_std": 0.2909988582134247, "rewards/accuracy_reward": 0.5704861283302307, "rewards/brier_reward": 0.6749843955039978, "rewards/confidence_one_or_zero": 0.04965277686715126, "rewards/format_reward": 0.9813368082046509, "rewards/mean_confidence_reward": 0.8376475691795349, "signal/accuracy_reward/centered_abs_mean": 0.24557291567325593, "signal/accuracy_reward/group_std_mean": 0.3080396592617035, "signal/accuracy_reward/group_zero_std_frac": 0.19166666865348816, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.12278645783662796, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.12278645783662796, "signal/advantage_abs_mean": 0.2273057609796524, "signal/advantage_pre_scale_abs_mean": 0.2273057609796524, "signal/advantage_pre_scale_std": 0.3210273563861847, "signal/advantage_std": 0.3210273563861847, "signal/brier_reward/centered_abs_mean": 0.19555461704730986, "signal/brier_reward/group_std_mean": 0.24684284925460814, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09777730852365493, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.09777730852365493, "signal/confidence_one_or_zero/centered_abs_mean": 0.08498263955116273, "signal/confidence_one_or_zero/group_std_mean": 0.15688682496547698, "signal/confidence_one_or_zero/group_zero_std_frac": 0.38055555820465087, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 8.498263582623622e-07, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 8.498263582623622e-07, "signal/format_reward/centered_abs_mean": 0.03173285573720932, "signal/format_reward/group_std_mean": 0.05944188237190247, "signal/format_reward/group_zero_std_frac": 0.7583333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01586642786860466, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01586642786860466, "signal/mean_confidence_reward/centered_abs_mean": 0.0822438582777977, "signal/mean_confidence_reward/group_std_mean": 0.11605418920516967, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.224385169341986e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.224385169341986e-07, "step": 50 }, { "epoch": 0.11999850001874976, "eval_calibration/aurc": 0.331946079147354, "eval_calibration/batch_distribution_entropy": 0.4968428063828032, "eval_calibration/confidence_entropy": 0.4144328386703737, "eval_calibration/coverage@0%": 0.020833333333333332, "eval_calibration/coverage@1%": 0.020833333333333332, "eval_calibration/coverage@10%": 0.020833333333333332, "eval_calibration/coverage@15%": 0.08333333333333333, "eval_calibration/coverage@20%": 0.15625, "eval_calibration/coverage@25%": 0.5243055555555556, "eval_calibration/coverage@30%": 0.5833333333333334, "eval_calibration/coverage@5%": 0.020833333333333332, "eval_calibration/ece": 0.20076388888888888, "eval_calibration/mean_confidence": 0.8347569444444445, "eval_completions/clipped_ratio": 0.010416666666666666, "eval_completions/max_length": 2485.5, "eval_completions/max_terminated_length": 2485.5, "eval_completions/mean_length": 595.9890747070312, "eval_completions/mean_terminated_length": 602.2484334309896, "eval_completions/min_length": 69.16666666666667, "eval_completions/min_terminated_length": 191.16666666666666, "eval_loss": 0.0, "eval_num_tokens": 87789499.0, "eval_reward": 1.1306366324424744, "eval_reward_std": 0.425055851538976, "eval_rewards/accuracy_reward": 0.5833333333333334, "eval_rewards/brier_reward": 0.6926797727743784, "eval_rewards/confidence_one_or_zero": 0.033854167287548385, "eval_rewards/format_reward": 0.9852430621782938, "eval_rewards/mean_confidence_reward": 0.8183419903119405, "eval_runtime": 200.5667, "eval_samples_per_second": 4.986, "eval_signal/accuracy_reward/centered_abs_mean": 0.4698350677887599, "eval_signal/accuracy_reward/group_std_mean": 0.4921261817216873, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.23491753389437994, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.23491753389437994, "eval_signal/advantage_abs_mean": 0.3960884014765422, "eval_signal/advantage_pre_scale_abs_mean": 0.3960884014765422, "eval_signal/advantage_pre_scale_std": 0.42062007387479144, "eval_signal/advantage_std": 0.42062007387479144, "eval_signal/brier_reward/centered_abs_mean": 0.30918586750825244, "eval_signal/brier_reward/group_std_mean": 0.34099134306112927, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15459293375412622, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.15459293375412622, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0630967877805233, "eval_signal/confidence_one_or_zero/group_std_mean": 0.1429342900713285, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 0.3333333383003871, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.309678515966274e-07, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.309678515966274e-07, "eval_signal/format_reward/centered_abs_mean": 0.02826605938995878, "eval_signal/format_reward/group_std_mean": 0.07450965698808432, "eval_signal/format_reward/group_zero_std_frac": 0.6111111243565878, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.01413302969497939, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.01413302969497939, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.09285100921988487, "eval_signal/mean_confidence_reward/group_std_mean": 0.1355661302804947, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 9.28510113832696e-07, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 9.28510113832696e-07, "eval_steps_per_second": 0.03, "step": 50 }, { "epoch": 0.11999850001874976, "step": 50, "train_probe_calibration/aurc": 0.32173371103634807, "train_probe_calibration/batch_distribution_entropy": 0.4987900164803152, "train_probe_calibration/confidence_entropy": 0.4132968808741664, "train_probe_calibration/coverage@0%": 0.03229166666666667, "train_probe_calibration/coverage@1%": 0.03229166666666667, "train_probe_calibration/coverage@10%": 0.08958333333333333, "train_probe_calibration/coverage@15%": 0.10520833333333333, "train_probe_calibration/coverage@20%": 0.19965277777777776, "train_probe_calibration/coverage@25%": 0.30062724014336917, "train_probe_calibration/coverage@30%": 0.5228494623655914, "train_probe_calibration/coverage@5%": 0.03229166666666667, "train_probe_calibration/ece": 0.23035114247311828, "train_probe_calibration/mean_confidence": 0.8356714829749105, "train_probe_completions/clipped_ratio": 0.009548611111111086, "train_probe_completions/max_length": 2617.6666666666665, "train_probe_completions/max_terminated_length": 2617.6666666666665, "train_probe_completions/mean_length": 598.6559244791666, "train_probe_completions/mean_terminated_length": 604.4992065429688, "train_probe_completions/min_length": 64.0, "train_probe_completions/min_terminated_length": 170.83333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 87789499.0, "train_probe_reward": 1.157408078511556, "train_probe_reward_std": 0.4236765404542287, "train_probe_rewards/accuracy_reward": 0.613715281089147, "train_probe_rewards/brier_reward": 0.7158407171567281, "train_probe_rewards/confidence_one_or_zero": 0.031250000543271504, "train_probe_rewards/format_reward": 0.9852430522441864, "train_probe_rewards/mean_confidence_reward": 0.8221006989479065, "train_probe_runtime": 198.0354, "train_probe_samples_per_second": 5.05, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4659830729166667, "train_probe_signal/accuracy_reward/group_std_mean": 0.49015389382839203, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.23299153645833334, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.23299153645833334, "train_probe_signal/advantage_abs_mean": 0.3936274101336797, "train_probe_signal/advantage_pre_scale_abs_mean": 0.3936274101336797, "train_probe_signal/advantage_pre_scale_std": 0.4189818451801936, "train_probe_signal/advantage_std": 0.4189818451801936, "train_probe_signal/brier_reward/centered_abs_mean": 0.305221289396286, "train_probe_signal/brier_reward/group_std_mean": 0.3369348595539729, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.152610644698143, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.152610644698143, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.05826822963232795, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.1290165629858772, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 0.416666679084301, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 5.826822852365391e-07, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 5.826822852365391e-07, "train_probe_signal/format_reward/centered_abs_mean": 0.028266058769077063, "train_probe_signal/format_reward/group_std_mean": 0.07450965760896604, "train_probe_signal/format_reward/group_zero_std_frac": 0.6111111243565878, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.014133029384538531, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.014133029384538531, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.08915634453296661, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.1314987043539683, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.915634642410927e-07, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 8.915634642410927e-07, "train_probe_steps_per_second": 0.03 }, { "calibration/aurc": 0.3556234230507609, "calibration/batch_distribution_entropy": 0.5368253933068459, "calibration/confidence_entropy": 0.4324260090406048, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.07404927248677248, "calibration/coverage@25%": 0.1667630453327255, "calibration/coverage@30%": 0.402088772845953, "calibration/coverage@5%": 0.0, "calibration/ece": 0.22780432165725464, "calibration/mean_confidence": 0.8258269191627734, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011111111111111117, "completions/max_length": 4053.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 627.1282104492187, "completions/mean_terminated_length": 634.2450073242187, "completions/min_length": 0.0, "completions/min_terminated_length": 155.0, "epoch": 0.13199835002062474, "grad_norm": 0.0013571049785241485, "learning_rate": 1.3221153846153848e-06, "loss": -0.0057, "num_tokens": 98094592.0, "reward": 1.1434667587280274, "reward_std": 0.25894117057323457, "rewards/accuracy_reward": 0.5953125119209289, "rewards/brier_reward": 0.7062745332717896, "rewards/confidence_one_or_zero": 0.019704860635101795, "rewards/format_reward": 0.9853298544883728, "rewards/mean_confidence_reward": 0.8103281259536743, "signal/accuracy_reward/centered_abs_mean": 0.2235894113779068, "signal/accuracy_reward/group_std_mean": 0.2862032353878021, "signal/accuracy_reward/group_zero_std_frac": 0.22500000298023223, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1117947056889534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1117947056889534, "signal/advantage_abs_mean": 0.19919183552265168, "signal/advantage_pre_scale_abs_mean": 0.19919183552265168, "signal/advantage_pre_scale_std": 0.2940775454044342, "signal/advantage_std": 0.2940775454044342, "signal/brier_reward/centered_abs_mean": 0.1678469717502594, "signal/brier_reward/group_std_mean": 0.2148392230272293, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0839234858751297, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0839234858751297, "signal/confidence_one_or_zero/centered_abs_mean": 0.03670247457921505, "signal/confidence_one_or_zero/group_std_mean": 0.08621180579066276, "signal/confidence_one_or_zero/group_zero_std_frac": 0.5833333373069763, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.6702474233152315e-07, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.6702474233152315e-07, "signal/format_reward/centered_abs_mean": 0.024853516183793545, "signal/format_reward/group_std_mean": 0.04575493335723877, "signal/format_reward/group_zero_std_frac": 0.8138889074325562, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012426758091896773, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012426758091896773, "signal/mean_confidence_reward/centered_abs_mean": 0.08177917897701263, "signal/mean_confidence_reward/group_std_mean": 0.10944290608167648, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.177917379725841e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.177917379725841e-07, "step": 55 }, { "calibration/aurc": 0.4046435372137346, "calibration/batch_distribution_entropy": 0.574484336749622, "calibration/confidence_entropy": 0.4760174611737183, "calibration/coverage@0%": 0.0010526388686902264, "calibration/coverage@1%": 0.0010526388686902264, "calibration/coverage@10%": 0.0010526388686902264, "calibration/coverage@15%": 0.0010526388686902264, "calibration/coverage@20%": 0.20105263886869024, "calibration/coverage@25%": 0.20105263886869024, "calibration/coverage@30%": 0.20105263886869024, "calibration/coverage@5%": 0.0010526388686902264, "calibration/ece": 0.23034133637129126, "calibration/mean_confidence": 0.7963875726865175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014930555555555558, "completions/max_length": 3982.8, "completions/max_terminated_length": 3982.8, "completions/mean_length": 651.958935546875, "completions/mean_terminated_length": 661.8682006835937, "completions/min_length": 0.0, "completions/min_terminated_length": 167.4, "epoch": 0.14399820002249972, "grad_norm": 0.0007229429902508855, "learning_rate": 1.4423076923076922e-06, "loss": -0.0079, "num_tokens": 108701735.0, "reward": 1.1390937089920044, "reward_std": 0.25963895320892333, "rewards/accuracy_reward": 0.5878472208976746, "rewards/brier_reward": 0.7088140368461608, "rewards/confidence_one_or_zero": 0.011371527798473835, "rewards/format_reward": 0.9815104246139527, "rewards/mean_confidence_reward": 0.7796067595481873, "signal/accuracy_reward/centered_abs_mean": 0.22587890326976776, "signal/accuracy_reward/group_std_mean": 0.28979323506355287, "signal/accuracy_reward/group_zero_std_frac": 0.21666666567325593, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.11293945163488388, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.11293945163488388, "signal/advantage_abs_mean": 0.19712428450584413, "signal/advantage_pre_scale_abs_mean": 0.19712428450584413, "signal/advantage_pre_scale_std": 0.2929088294506073, "signal/advantage_std": 0.2929088294506073, "signal/brier_reward/centered_abs_mean": 0.15875896513462068, "signal/brier_reward/group_std_mean": 0.20580186247825621, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07937948256731034, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.07937948256731034, "signal/confidence_one_or_zero/centered_abs_mean": 0.020035807229578494, "signal/confidence_one_or_zero/group_std_mean": 0.04319153465330601, "signal/confidence_one_or_zero/group_zero_std_frac": 0.8, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 2.0035806755913654e-07, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 2.0035806755913654e-07, "signal/format_reward/centered_abs_mean": 0.03181966096162796, "signal/format_reward/group_std_mean": 0.05989409610629082, "signal/format_reward/group_zero_std_frac": 0.7583333492279053, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01590983048081398, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01590983048081398, "signal/mean_confidence_reward/centered_abs_mean": 0.08490917384624481, "signal/mean_confidence_reward/group_std_mean": 0.11686530560255051, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.490917252856889e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.490917252856889e-07, "step": 60 }, { "calibration/aurc": 0.34525159251626814, "calibration/batch_distribution_entropy": 0.6253952245942033, "calibration/confidence_entropy": 0.5381018167275411, "calibration/coverage@0%": 0.0015748031496062992, "calibration/coverage@1%": 0.0015748031496062992, "calibration/coverage@10%": 0.0015748031496062992, "calibration/coverage@15%": 0.03569553805774278, "calibration/coverage@20%": 0.13910761154855644, "calibration/coverage@25%": 0.4191204216183852, "calibration/coverage@30%": 0.572542910473945, "calibration/coverage@5%": 0.0015748031496062992, "calibration/ece": 0.14509340351465788, "calibration/mean_confidence": 0.7448597149980478, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01328125, "completions/max_length": 3825.4, "completions/max_terminated_length": 3825.4, "completions/mean_length": 652.9483520507813, "completions/mean_terminated_length": 661.7545776367188, "completions/min_length": 0.0, "completions/min_terminated_length": 167.0, "epoch": 0.1559980500243747, "grad_norm": 0.0017221093876287341, "learning_rate": 1.5625e-06, "loss": -0.0071, "num_tokens": 119317748.0, "reward": 1.1734039068222046, "reward_std": 0.22781020700931548, "rewards/accuracy_reward": 0.621875, "rewards/brier_reward": 0.7409771919250489, "rewards/confidence_one_or_zero": 0.002517361153149977, "rewards/format_reward": 0.9839409828186035, "rewards/mean_confidence_reward": 0.7322934150695801, "signal/accuracy_reward/centered_abs_mean": 0.2032986104488373, "signal/accuracy_reward/group_std_mean": 0.2636931657791138, "signal/accuracy_reward/group_zero_std_frac": 0.26666667461395266, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.10164930522441865, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.10164930522441865, "signal/advantage_abs_mean": 0.1711862117052078, "signal/advantage_pre_scale_abs_mean": 0.1711862117052078, "signal/advantage_pre_scale_std": 0.2649614542722702, "signal/advantage_std": 0.2649614542722702, "signal/brier_reward/centered_abs_mean": 0.13180441409349442, "signal/brier_reward/group_std_mean": 0.17146021127700806, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.06590220704674721, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.06590220704674721, "signal/confidence_one_or_zero/centered_abs_mean": 0.0047797308187000455, "signal/confidence_one_or_zero/group_std_mean": 0.012312699295580386, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9361111283302307, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.779730602422205e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.779730602422205e-08, "signal/format_reward/centered_abs_mean": 0.02718641497194767, "signal/format_reward/group_std_mean": 0.05054701194167137, "signal/format_reward/group_zero_std_frac": 0.794444453716278, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.013593207485973835, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.013593207485973835, "signal/mean_confidence_reward/centered_abs_mean": 0.08394395560026169, "signal/mean_confidence_reward/group_std_mean": 0.11216964721679687, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.394395194954995e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.394395194954995e-07, "step": 65 }, { "calibration/aurc": 0.391721039437937, "calibration/batch_distribution_entropy": 0.5863541906056353, "calibration/confidence_entropy": 0.584135174856569, "calibration/coverage@0%": 0.0031845862034541273, "calibration/coverage@1%": 0.0031845862034541273, "calibration/coverage@10%": 0.0031845862034541273, "calibration/coverage@15%": 0.021174004192872117, "calibration/coverage@20%": 0.021174004192872117, "calibration/coverage@25%": 0.027980286915385205, "calibration/coverage@30%": 0.027980286915385205, "calibration/coverage@5%": 0.0031845862034541273, "calibration/ece": 0.12742719004233602, "calibration/mean_confidence": 0.703203106831504, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013020833333333325, "completions/max_length": 4019.8, "completions/max_terminated_length": 4019.8, "completions/mean_length": 639.8428955078125, "completions/mean_terminated_length": 648.2968383789063, "completions/min_length": 0.0, "completions/min_terminated_length": 151.0, "epoch": 0.16799790002624967, "grad_norm": 0.000651072186883539, "learning_rate": 1.682692307692308e-06, "loss": -0.0081, "num_tokens": 129766882.0, "reward": 1.1596927404403687, "reward_std": 0.21734236180782318, "rewards/accuracy_reward": 0.5971354126930237, "rewards/brier_reward": 0.7384687781333923, "rewards/confidence_one_or_zero": 0.0011284722539130598, "rewards/format_reward": 0.9837673544883728, "rewards/mean_confidence_reward": 0.6930816173553467, "signal/accuracy_reward/centered_abs_mean": 0.20099283754825592, "signal/accuracy_reward/group_std_mean": 0.25969791114330293, "signal/accuracy_reward/group_zero_std_frac": 0.28888889253139494, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.10049641877412796, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.10049641877412796, "signal/advantage_abs_mean": 0.16307039856910704, "signal/advantage_pre_scale_abs_mean": 0.16307039856910704, "signal/advantage_pre_scale_std": 0.2535522371530533, "signal/advantage_std": 0.2535522371530533, "signal/brier_reward/centered_abs_mean": 0.11968168914318085, "signal/brier_reward/group_std_mean": 0.15591173470020295, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.059840844571590425, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.059840844571590425, "signal/confidence_one_or_zero/centered_abs_mean": 0.002164713467936963, "signal/confidence_one_or_zero/group_std_mean": 0.00578572116792202, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9694444417953492, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 2.1647134573754557e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 2.1647134573754557e-08, "signal/format_reward/centered_abs_mean": 0.02758789099752903, "signal/format_reward/group_std_mean": 0.05219795480370522, "signal/format_reward/group_zero_std_frac": 0.7833333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.013793945498764514, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.013793945498764514, "signal/mean_confidence_reward/centered_abs_mean": 0.08134629875421524, "signal/mean_confidence_reward/group_std_mean": 0.10953120738267899, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.134629524647607e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.134629524647607e-07, "step": 70 }, { "calibration/aurc": 0.33361351654727667, "calibration/batch_distribution_entropy": 0.5256188509791653, "calibration/confidence_entropy": 0.6123191486908507, "calibration/coverage@0%": 0.008882809994961296, "calibration/coverage@1%": 0.008882809994961296, "calibration/coverage@10%": 0.04752510764509184, "calibration/coverage@15%": 0.20209429824561403, "calibration/coverage@20%": 0.24744481676158198, "calibration/coverage@25%": 0.2579879298634856, "calibration/coverage@30%": 0.2579879298634856, "calibration/coverage@5%": 0.03864782304979158, "calibration/ece": 0.11076360391198636, "calibration/mean_confidence": 0.6773763935823934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011284722222222232, "completions/max_length": 3997.8, "completions/max_terminated_length": 3997.8, "completions/mean_length": 639.3073852539062, "completions/mean_terminated_length": 646.6291870117187, "completions/min_length": 0.0, "completions/min_terminated_length": 190.8, "epoch": 0.17999775002812465, "grad_norm": 0.0006053974502719939, "learning_rate": 1.8028846153846156e-06, "loss": -0.0072, "num_tokens": 140196599.0, "reward": 1.2024640083312987, "reward_std": 0.20960286855697632, "rewards/accuracy_reward": 0.653906238079071, "rewards/brier_reward": 0.7642894625663758, "rewards/confidence_one_or_zero": 0.0006076389050576836, "rewards/format_reward": 0.986718761920929, "rewards/mean_confidence_reward": 0.6664036393165589, "signal/accuracy_reward/centered_abs_mean": 0.20212131142616271, "signal/accuracy_reward/group_std_mean": 0.2648590564727783, "signal/accuracy_reward/group_zero_std_frac": 0.2555555611848831, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.10106065571308136, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.10106065571308136, "signal/advantage_abs_mean": 0.15458548367023467, "signal/advantage_pre_scale_abs_mean": 0.15458548367023467, "signal/advantage_pre_scale_std": 0.24506961107254027, "signal/advantage_std": 0.24506961107254027, "signal/brier_reward/centered_abs_mean": 0.10443567931652069, "signal/brier_reward/group_std_mean": 0.1381068229675293, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.052217839658260344, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.052217839658260344, "signal/confidence_one_or_zero/centered_abs_mean": 0.0011338975629769266, "signal/confidence_one_or_zero/group_std_mean": 0.0024878684431314468, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9888888835906983, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.133897491456537e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.133897491456537e-08, "signal/format_reward/centered_abs_mean": 0.02369249165058136, "signal/format_reward/group_std_mean": 0.04811062589287758, "signal/format_reward/group_zero_std_frac": 0.7888888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01184624582529068, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01184624582529068, "signal/mean_confidence_reward/centered_abs_mean": 0.07412965595722198, "signal/mean_confidence_reward/group_std_mean": 0.09953199625015259, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.412965715047903e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.412965715047903e-07, "step": 75 }, { "calibration/aurc": 0.31415098407121106, "calibration/batch_distribution_entropy": 0.46902417936271484, "calibration/confidence_entropy": 0.628581441245873, "calibration/coverage@0%": 0.0036883766143591167, "calibration/coverage@1%": 0.0036883766143591167, "calibration/coverage@10%": 0.008965421469240382, "calibration/coverage@15%": 0.10020552701013748, "calibration/coverage@20%": 0.22210526315789472, "calibration/coverage@25%": 0.2676061563831249, "calibration/coverage@30%": 0.4338051394805552, "calibration/coverage@5%": 0.0036883766143591167, "calibration/ece": 0.09603608896515689, "calibration/mean_confidence": 0.6583578991056717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013888888888888885, "completions/max_length": 3922.0, "completions/max_terminated_length": 3922.0, "completions/mean_length": 667.4120727539063, "completions/mean_terminated_length": 676.832470703125, "completions/min_length": 0.0, "completions/min_terminated_length": 172.0, "epoch": 0.19199760002999963, "grad_norm": 0.0007242803694680333, "learning_rate": 1.9230769230769234e-06, "loss": -0.0103, "num_tokens": 150938466.0, "reward": 1.1739783763885498, "reward_std": 0.20588041245937347, "rewards/accuracy_reward": 0.61328125, "rewards/brier_reward": 0.7505479693412781, "rewards/confidence_one_or_zero": 0.000260416668606922, "rewards/format_reward": 0.9841145873069763, "rewards/mean_confidence_reward": 0.6444980144500733, "signal/accuracy_reward/centered_abs_mean": 0.2073187917470932, "signal/accuracy_reward/group_std_mean": 0.26844447255134585, "signal/accuracy_reward/group_zero_std_frac": 0.2527777761220932, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1036593958735466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1036593958735466, "signal/advantage_abs_mean": 0.15543047487735748, "signal/advantage_pre_scale_abs_mean": 0.15543047487735748, "signal/advantage_pre_scale_std": 0.24077796339988708, "signal/advantage_std": 0.24077796339988708, "signal/brier_reward/centered_abs_mean": 0.09829773604869843, "signal/brier_reward/group_std_mean": 0.12834340631961821, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.049148868024349215, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.049148868024349215, "signal/confidence_one_or_zero/centered_abs_mean": 0.0005045572877861559, "signal/confidence_one_or_zero/group_std_mean": 0.0014731390401721, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 5.0455724931453e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 5.0455724931453e-09, "signal/format_reward/centered_abs_mean": 0.02540690116584301, "signal/format_reward/group_std_mean": 0.04631754383444786, "signal/format_reward/group_zero_std_frac": 0.8138888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012703450582921504, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012703450582921504, "signal/mean_confidence_reward/centered_abs_mean": 0.06891447007656097, "signal/mean_confidence_reward/group_std_mean": 0.09084324985742569, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.89144678744924e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.89144678744924e-07, "step": 80 }, { "calibration/aurc": 0.3747820357096822, "calibration/batch_distribution_entropy": 0.4269843920058415, "calibration/confidence_entropy": 0.6394617993427885, "calibration/coverage@0%": 0.0015888246084379853, "calibration/coverage@1%": 0.0015888246084379853, "calibration/coverage@10%": 0.0015888246084379853, "calibration/coverage@15%": 0.0015888246084379853, "calibration/coverage@20%": 0.019046140821221958, "calibration/coverage@25%": 0.026940877663327218, "calibration/coverage@30%": 0.03283900098772401, "calibration/coverage@5%": 0.0015888246084379853, "calibration/ece": 0.06526946038574769, "calibration/mean_confidence": 0.641408067096986, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011197916666666674, "completions/max_length": 3992.8, "completions/max_terminated_length": 3992.8, "completions/mean_length": 667.7173706054688, "completions/mean_terminated_length": 675.2990234375, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.2039974500318746, "grad_norm": 0.0007622387493029237, "learning_rate": 2.043269230769231e-06, "loss": -0.0088, "num_tokens": 161717770.0, "reward": 1.2014102220535279, "reward_std": 0.20460900366306306, "rewards/accuracy_reward": 0.6520833492279052, "rewards/brier_reward": 0.763571560382843, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9871527791023255, "rewards/mean_confidence_reward": 0.6351875066757202, "signal/accuracy_reward/centered_abs_mean": 0.20806206464767457, "signal/accuracy_reward/group_std_mean": 0.26917231678962705, "signal/accuracy_reward/group_zero_std_frac": 0.2527777820825577, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.10403103232383729, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.10403103232383729, "signal/advantage_abs_mean": 0.1508684515953064, "signal/advantage_pre_scale_abs_mean": 0.1508684515953064, "signal/advantage_pre_scale_std": 0.2377762407064438, "signal/advantage_std": 0.2377762407064438, "signal/brier_reward/centered_abs_mean": 0.0905149295926094, "signal/brier_reward/group_std_mean": 0.12293076515197754, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0452574647963047, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0452574647963047, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.02354600690305233, "signal/format_reward/group_std_mean": 0.051589816063642505, "signal/format_reward/group_zero_std_frac": 0.7638889074325561, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.011773003451526164, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.011773003451526164, "signal/mean_confidence_reward/centered_abs_mean": 0.0640238031744957, "signal/mean_confidence_reward/group_std_mean": 0.08753588497638702, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.402380108738725e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.402380108738725e-07, "step": 85 }, { "calibration/aurc": 0.3247442002818316, "calibration/batch_distribution_entropy": 0.39866431697763344, "calibration/confidence_entropy": 0.6398354816158724, "calibration/coverage@0%": 0.009935464128996051, "calibration/coverage@1%": 0.009935464128996051, "calibration/coverage@10%": 0.01829055551280806, "calibration/coverage@15%": 0.026667518863593403, "calibration/coverage@20%": 0.03983290556210882, "calibration/coverage@25%": 0.32806448132718, "calibration/coverage@30%": 0.4920057344294312, "calibration/coverage@5%": 0.01829055551280806, "calibration/ece": 0.07287273641423204, "calibration/mean_confidence": 0.6461269546434517, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015972222222222256, "completions/max_length": 3955.8, "completions/max_terminated_length": 3955.8, "completions/mean_length": 647.5657104492187, "completions/mean_terminated_length": 658.079052734375, "completions/min_length": 0.0, "completions/min_terminated_length": 204.6, "epoch": 0.2159973000337496, "grad_norm": 0.0006742156692780554, "learning_rate": 2.1634615384615387e-06, "loss": -0.0138, "num_tokens": 172246399.0, "reward": 1.190082883834839, "reward_std": 0.21081917583942414, "rewards/accuracy_reward": 0.6394965291023255, "rewards/brier_reward": 0.7583647847175599, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9822916746139526, "rewards/mean_confidence_reward": 0.6332682251930237, "signal/accuracy_reward/centered_abs_mean": 0.21086697280406952, "signal/accuracy_reward/group_std_mean": 0.2713586360216141, "signal/accuracy_reward/group_zero_std_frac": 0.2583333343267441, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.10543348640203476, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.10543348640203476, "signal/advantage_abs_mean": 0.15805128812789918, "signal/advantage_pre_scale_abs_mean": 0.15805128812789918, "signal/advantage_pre_scale_std": 0.2451910436153412, "signal/advantage_std": 0.2451910436153412, "signal/brier_reward/centered_abs_mean": 0.0934031143784523, "signal/brier_reward/group_std_mean": 0.12460439652204514, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04670155718922615, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04670155718922615, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0294921875, "signal/format_reward/group_std_mean": 0.05477819964289665, "signal/format_reward/group_zero_std_frac": 0.7750000119209289, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01474609375, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01474609375, "signal/mean_confidence_reward/centered_abs_mean": 0.06516138985753059, "signal/mean_confidence_reward/group_std_mean": 0.08637717664241791, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.516138796541781e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.516138796541781e-07, "step": 90 }, { "calibration/aurc": 0.3678564965542314, "calibration/batch_distribution_entropy": 0.4029568684473769, "calibration/confidence_entropy": 0.6350645390948854, "calibration/coverage@0%": 0.0015776243976280083, "calibration/coverage@1%": 0.0015776243976280083, "calibration/coverage@10%": 0.006289666282444763, "calibration/coverage@15%": 0.006289666282444763, "calibration/coverage@20%": 0.12310608690821727, "calibration/coverage@25%": 0.4111405835543767, "calibration/coverage@30%": 0.4111405835543767, "calibration/coverage@5%": 0.0015776243976280083, "calibration/ece": 0.11918611460384754, "calibration/mean_confidence": 0.6537054430168625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011371527777777769, "completions/max_length": 3645.8, "completions/max_terminated_length": 3645.8, "completions/mean_length": 662.3494140625, "completions/mean_terminated_length": 669.9871215820312, "completions/min_length": 0.0, "completions/min_terminated_length": 181.0, "epoch": 0.22799715003562457, "grad_norm": 0.0007057040347717702, "learning_rate": 2.283653846153846e-06, "loss": -0.0081, "num_tokens": 182968344.0, "reward": 1.1857956409454347, "reward_std": 0.1926214337348938, "rewards/accuracy_reward": 0.6279513835906982, "rewards/brier_reward": 0.7563873648643493, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9872395753860473, "rewards/mean_confidence_reward": 0.6442664980888366, "signal/accuracy_reward/centered_abs_mean": 0.19052734375, "signal/accuracy_reward/group_std_mean": 0.2476816862821579, "signal/accuracy_reward/group_zero_std_frac": 0.3194444477558136, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.095263671875, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.095263671875, "signal/advantage_abs_mean": 0.14359890520572663, "signal/advantage_pre_scale_abs_mean": 0.14359890520572663, "signal/advantage_pre_scale_std": 0.2281176596879959, "signal/advantage_std": 0.2281176596879959, "signal/brier_reward/centered_abs_mean": 0.08913325816392899, "signal/brier_reward/group_std_mean": 0.11816292852163315, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.044566629081964494, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.044566629081964494, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.02169596366584301, "signal/format_reward/group_std_mean": 0.042503207176923755, "signal/format_reward/group_zero_std_frac": 0.819444453716278, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.010847981832921506, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010847981832921506, "signal/mean_confidence_reward/centered_abs_mean": 0.060303809493780135, "signal/mean_confidence_reward/group_std_mean": 0.07880772352218628, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.030380745869479e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.030380745869479e-07, "step": 95 }, { "calibration/aurc": 0.30940192449927456, "calibration/batch_distribution_entropy": 0.3859611149460116, "calibration/confidence_entropy": 0.6269758168356413, "calibration/coverage@0%": 0.0015873909372361382, "calibration/coverage@1%": 0.0015873909372361382, "calibration/coverage@10%": 0.0074540576039028045, "calibration/coverage@15%": 0.01332072427056947, "calibration/coverage@20%": 0.1354259874284642, "calibration/coverage@25%": 0.24155932076179756, "calibration/coverage@30%": 0.5579411066586616, "calibration/coverage@5%": 0.0015873909372361382, "calibration/ece": 0.0656766320954025, "calibration/mean_confidence": 0.6680082649537453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014149305555555537, "completions/max_length": 3573.2, "completions/max_terminated_length": 3573.2, "completions/mean_length": 680.751123046875, "completions/mean_terminated_length": 690.4747802734375, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.23999700003749952, "grad_norm": 0.0005587974446825683, "learning_rate": 2.403846153846154e-06, "loss": -0.0092, "num_tokens": 193909669.0, "reward": 1.1963390111923218, "reward_std": 0.19775362610816954, "rewards/accuracy_reward": 0.6446180582046509, "rewards/brier_reward": 0.7633245348930359, "rewards/confidence_one_or_zero": 0.000260416668606922, "rewards/format_reward": 0.9847222328186035, "rewards/mean_confidence_reward": 0.6531527757644653, "signal/accuracy_reward/centered_abs_mean": 0.19210069477558137, "signal/accuracy_reward/group_std_mean": 0.25235746800899506, "signal/accuracy_reward/group_zero_std_frac": 0.29166666567325594, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09605034738779068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.09605034738779068, "signal/advantage_abs_mean": 0.1445866495370865, "signal/advantage_pre_scale_abs_mean": 0.1445866495370865, "signal/advantage_pre_scale_std": 0.23335390090942382, "signal/advantage_std": 0.23335390090942382, "signal/brier_reward/centered_abs_mean": 0.09065800160169601, "signal/brier_reward/group_std_mean": 0.12145532816648483, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04532900080084801, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04532900080084801, "signal/confidence_one_or_zero/centered_abs_mean": 0.0005045572877861559, "signal/confidence_one_or_zero/group_std_mean": 0.0014731390401721, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 5.0455724931453e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 5.0455724931453e-09, "signal/format_reward/centered_abs_mean": 0.02416449636220932, "signal/format_reward/group_std_mean": 0.04657035693526268, "signal/format_reward/group_zero_std_frac": 0.8055555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01208224818110466, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01208224818110466, "signal/mean_confidence_reward/centered_abs_mean": 0.05831857323646546, "signal/mean_confidence_reward/group_std_mean": 0.07910713404417039, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.831857151861186e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.831857151861186e-07, "step": 100 }, { "epoch": 0.23999700003749952, "eval_calibration/aurc": 0.3114558181468945, "eval_calibration/batch_distribution_entropy": 0.35943074260483643, "eval_calibration/confidence_entropy": 0.631874775458992, "eval_calibration/coverage@0%": 0.03662634408602151, "eval_calibration/coverage@1%": 0.03662634408602151, "eval_calibration/coverage@10%": 0.03662634408602151, "eval_calibration/coverage@15%": 0.03662634408602151, "eval_calibration/coverage@20%": 0.03662634408602151, "eval_calibration/coverage@25%": 0.359375, "eval_calibration/coverage@30%": 0.671875, "eval_calibration/coverage@5%": 0.03662634408602151, "eval_calibration/ece": 0.12016969086021506, "eval_calibration/mean_confidence": 0.6602066532258064, "eval_completions/clipped_ratio": 0.010416666666666685, "eval_completions/max_length": 2313.6666666666665, "eval_completions/max_terminated_length": 2313.6666666666665, "eval_completions/mean_length": 665.7246704101562, "eval_completions/mean_terminated_length": 672.8719177246094, "eval_completions/min_length": 66.33333333333333, "eval_completions/min_terminated_length": 257.5, "eval_loss": 0.0, "eval_num_tokens": 193909669.0, "eval_reward": 1.2051163117090862, "eval_reward_std": 0.341262087225914, "eval_rewards/accuracy_reward": 0.6553819378217062, "eval_rewards/brier_reward": 0.7669900357723236, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9878472288449606, "eval_rewards/mean_confidence_reward": 0.6588975489139557, "eval_runtime": 203.115, "eval_samples_per_second": 4.923, "eval_signal/accuracy_reward/centered_abs_mean": 0.4401584168275197, "eval_signal/accuracy_reward/group_std_mean": 0.47595187028249103, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.22007920841375986, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.22007920841375986, "eval_signal/advantage_abs_mean": 0.30565472940603894, "eval_signal/advantage_pre_scale_abs_mean": 0.30565472940603894, "eval_signal/advantage_pre_scale_std": 0.33811872204144794, "eval_signal/advantage_std": 0.33811872204144794, "eval_signal/brier_reward/centered_abs_mean": 0.1567381595571836, "eval_signal/brier_reward/group_std_mean": 0.1857337479790052, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0783690797785918, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.0783690797785918, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.023437499689559143, "eval_signal/format_reward/group_std_mean": 0.0657570840169986, "eval_signal/format_reward/group_zero_std_frac": 0.6388889104127884, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.011718749844779571, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.011718749844779571, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.058930120120445885, "eval_signal/mean_confidence_reward/group_std_mean": 0.08698150143027306, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.893011518764979e-07, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 5.893011518764979e-07, "eval_steps_per_second": 0.03, "step": 100 }, { "epoch": 0.23999700003749952, "step": 100, "train_probe_calibration/aurc": 0.3424971747171755, "train_probe_calibration/batch_distribution_entropy": 0.3750340797290245, "train_probe_calibration/confidence_entropy": 0.6293407942965378, "train_probe_calibration/coverage@0%": 0.020833333333333332, "train_probe_calibration/coverage@1%": 0.020833333333333332, "train_probe_calibration/coverage@10%": 0.020833333333333332, "train_probe_calibration/coverage@15%": 0.020833333333333332, "train_probe_calibration/coverage@20%": 0.020833333333333332, "train_probe_calibration/coverage@25%": 0.11458333333333333, "train_probe_calibration/coverage@30%": 0.3541666666666667, "train_probe_calibration/coverage@5%": 0.020833333333333332, "train_probe_calibration/ece": 0.07714213709677416, "train_probe_calibration/mean_confidence": 0.6633988575268818, "train_probe_completions/clipped_ratio": 0.018055555555555547, "train_probe_completions/max_length": 2835.6666666666665, "train_probe_completions/max_terminated_length": 2835.6666666666665, "train_probe_completions/mean_length": 660.6500142415365, "train_probe_completions/mean_terminated_length": 672.730702718099, "train_probe_completions/min_length": 0.0, "train_probe_completions/min_terminated_length": 241.33333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 193909669.0, "train_probe_reward": 1.203997532526652, "train_probe_reward_std": 0.34814903636773425, "train_probe_rewards/accuracy_reward": 0.6579861144224802, "train_probe_rewards/brier_reward": 0.7682248055934906, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9817708333333334, "train_probe_rewards/mean_confidence_reward": 0.6543402870496114, "train_probe_runtime": 206.1766, "train_probe_samples_per_second": 4.85, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4377170155445735, "train_probe_signal/accuracy_reward/group_std_mean": 0.47431273261706036, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21885850777228674, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.21885850777228674, "train_probe_signal/advantage_abs_mean": 0.30768390993277234, "train_probe_signal/advantage_pre_scale_abs_mean": 0.30768390993277234, "train_probe_signal/advantage_pre_scale_std": 0.34619641800721485, "train_probe_signal/advantage_std": 0.34619641800721485, "train_probe_signal/brier_reward/centered_abs_mean": 0.1566829557220141, "train_probe_signal/brier_reward/group_std_mean": 0.19027900944153467, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07834147786100705, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07834147786100705, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.034884982431928314, "train_probe_signal/format_reward/group_std_mean": 0.09116210353871186, "train_probe_signal/format_reward/group_zero_std_frac": 0.5277777860562006, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.017442491215964157, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.017442491215964157, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.06655815243721008, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.10166954745848973, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.655815430652486e-07, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 6.655815430652486e-07, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.3968421312687138, "calibration/batch_distribution_entropy": 0.3669186564554883, "calibration/confidence_entropy": 0.6241699610330542, "calibration/coverage@0%": 0.00633245382585752, "calibration/coverage@1%": 0.00633245382585752, "calibration/coverage@10%": 0.00633245382585752, "calibration/coverage@15%": 0.00633245382585752, "calibration/coverage@20%": 0.014775725593667546, "calibration/coverage@25%": 0.1507179404947594, "calibration/coverage@30%": 0.21762618467175399, "calibration/coverage@5%": 0.00633245382585752, "calibration/ece": 0.11595529539153711, "calibration/mean_confidence": 0.6747351307783813, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3581.8, "completions/max_terminated_length": 3581.8, "completions/mean_length": 673.0084350585937, "completions/mean_terminated_length": 683.7401123046875, "completions/min_length": 0.0, "completions/min_terminated_length": 196.2, "epoch": 0.2519968500393745, "grad_norm": 0.0005546111497096717, "learning_rate": 2.5240384615384618e-06, "loss": -0.012, "num_tokens": 204739590.0, "reward": 1.19893217086792, "reward_std": 0.19751378893852234, "rewards/accuracy_reward": 0.6484375119209289, "rewards/brier_reward": 0.7657330870628357, "rewards/confidence_one_or_zero": 0.000260416668606922, "rewards/format_reward": 0.9836805582046508, "rewards/mean_confidence_reward": 0.658142352104187, "signal/accuracy_reward/centered_abs_mean": 0.18950737714767457, "signal/accuracy_reward/group_std_mean": 0.24972547888755797, "signal/accuracy_reward/group_zero_std_frac": 0.3027777850627899, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09475368857383729, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.09475368857383729, "signal/advantage_abs_mean": 0.14571246653795242, "signal/advantage_pre_scale_abs_mean": 0.14571246653795242, "signal/advantage_pre_scale_std": 0.23486922085285186, "signal/advantage_std": 0.23486922085285186, "signal/brier_reward/centered_abs_mean": 0.0919464573264122, "signal/brier_reward/group_std_mean": 0.12245310842990875, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0459732286632061, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0459732286632061, "signal/confidence_one_or_zero/centered_abs_mean": 0.0004937065881676972, "signal/confidence_one_or_zero/group_std_mean": 0.001174198230728507, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.9370658672387435e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.9370658672387435e-09, "signal/format_reward/centered_abs_mean": 0.025835503824055195, "signal/format_reward/group_std_mean": 0.04826897084712982, "signal/format_reward/group_zero_std_frac": 0.8, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012917751912027597, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012917751912027597, "signal/mean_confidence_reward/centered_abs_mean": 0.05578613579273224, "signal/mean_confidence_reward/group_std_mean": 0.07785800248384475, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.578613468060212e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.578613468060212e-07, "step": 105 }, { "calibration/aurc": 0.3238293436923214, "calibration/batch_distribution_entropy": 0.362072415892365, "calibration/confidence_entropy": 0.6225513897425031, "calibration/coverage@0%": 0.002626273375955622, "calibration/coverage@1%": 0.002626273375955622, "calibration/coverage@10%": 0.008370398702326379, "calibration/coverage@15%": 0.012547944394232385, "calibration/coverage@20%": 0.0240361950469739, "calibration/coverage@25%": 0.1619957320542762, "calibration/coverage@30%": 0.5067696124389204, "calibration/coverage@5%": 0.002626273375955622, "calibration/ece": 0.06648445556203152, "calibration/mean_confidence": 0.6757644819200216, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015017361111111117, "completions/max_length": 3796.4, "completions/max_terminated_length": 3796.4, "completions/mean_length": 684.4747436523437, "completions/mean_terminated_length": 694.894287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.2639967000412495, "grad_norm": 0.0007070910651236773, "learning_rate": 2.6442307692307696e-06, "loss": -0.0122, "num_tokens": 215733187.0, "reward": 1.2193590164184571, "reward_std": 0.19997969567775725, "rewards/accuracy_reward": 0.6770833253860473, "rewards/brier_reward": 0.777333116531372, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9842881917953491, "rewards/mean_confidence_reward": 0.6580425381660462, "signal/accuracy_reward/centered_abs_mean": 0.19500868022441864, "signal/accuracy_reward/group_std_mean": 0.25922795832157136, "signal/accuracy_reward/group_zero_std_frac": 0.26666666865348815, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09750434011220932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.09750434011220932, "signal/advantage_abs_mean": 0.1463734030723572, "signal/advantage_pre_scale_abs_mean": 0.1463734030723572, "signal/advantage_pre_scale_std": 0.23799879252910613, "signal/advantage_std": 0.23799879252910613, "signal/brier_reward/centered_abs_mean": 0.09076128154993057, "signal/brier_reward/group_std_mean": 0.121206896007061, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.045380640774965286, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.045380640774965286, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.02450629398226738, "signal/format_reward/group_std_mean": 0.04501023888587952, "signal/format_reward/group_zero_std_frac": 0.8166666746139526, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01225314699113369, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01225314699113369, "signal/mean_confidence_reward/centered_abs_mean": 0.05382351577281952, "signal/mean_confidence_reward/group_std_mean": 0.07404199987649918, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.38235167368839e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.38235167368839e-07, "step": 110 }, { "calibration/aurc": 0.3984555175196269, "calibration/batch_distribution_entropy": 0.38057734453295144, "calibration/confidence_entropy": 0.6275560124648408, "calibration/coverage@0%": 0.0031277268760907507, "calibration/coverage@1%": 0.0031277268760907507, "calibration/coverage@10%": 0.0031277268760907507, "calibration/coverage@15%": 0.0031277268760907507, "calibration/coverage@20%": 0.010026633979915886, "calibration/coverage@25%": 0.20325580064658255, "calibration/coverage@30%": 0.4005235602094241, "calibration/coverage@5%": 0.0031277268760907507, "calibration/ece": 0.1474570584775872, "calibration/mean_confidence": 0.6628758820350109, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01944444444444442, "completions/max_length": 3656.2, "completions/max_terminated_length": 3656.2, "completions/mean_length": 672.4917602539062, "completions/mean_terminated_length": 685.9388061523438, "completions/min_length": 0.0, "completions/min_terminated_length": 206.2, "epoch": 0.27599655004312446, "grad_norm": 0.0006142130587249994, "learning_rate": 2.7644230769230775e-06, "loss": -0.0177, "num_tokens": 226559492.0, "reward": 1.1902765035629272, "reward_std": 0.19237456023693084, "rewards/accuracy_reward": 0.6390625, "rewards/brier_reward": 0.7610952854156494, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9803819417953491, "rewards/mean_confidence_reward": 0.6560199618339538, "signal/accuracy_reward/centered_abs_mean": 0.18200954794883728, "signal/accuracy_reward/group_std_mean": 0.23614366054534913, "signal/accuracy_reward/group_zero_std_frac": 0.33333333730697634, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09100477397441864, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.09100477397441864, "signal/advantage_abs_mean": 0.14305781871080397, "signal/advantage_pre_scale_abs_mean": 0.14305781871080397, "signal/advantage_pre_scale_std": 0.2359527140855789, "signal/advantage_std": 0.2359527140855789, "signal/brier_reward/centered_abs_mean": 0.09261435866355897, "signal/brier_reward/group_std_mean": 0.12086576223373413, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04630717933177948, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04630717933177948, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003255208255723119, "signal/confidence_one_or_zero/group_std_mean": 0.0006831518840044737, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.2552080142522754e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.2552080142522754e-09, "signal/format_reward/centered_abs_mean": 0.031065538339316846, "signal/format_reward/group_std_mean": 0.05347142927348614, "signal/format_reward/group_zero_std_frac": 0.800000011920929, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.015532769169658423, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.015532769169658423, "signal/mean_confidence_reward/centered_abs_mean": 0.05739936009049416, "signal/mean_confidence_reward/group_std_mean": 0.07904537916183471, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.739936227655562e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.739936227655562e-07, "step": 115 }, { "calibration/aurc": 0.36055781214333466, "calibration/batch_distribution_entropy": 0.3816988478552697, "calibration/confidence_entropy": 0.6244228061835957, "calibration/coverage@0%": 0.0047383704696487405, "calibration/coverage@1%": 0.0047383704696487405, "calibration/coverage@10%": 0.012087451834478137, "calibration/coverage@15%": 0.016799493719294893, "calibration/coverage@20%": 0.18001022814772633, "calibration/coverage@25%": 0.3508282737977792, "calibration/coverage@30%": 0.4277916245831196, "calibration/coverage@5%": 0.012087451834478137, "calibration/ece": 0.12751064955669236, "calibration/mean_confidence": 0.6683955364056197, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011979166666666674, "completions/max_length": 3239.4, "completions/max_terminated_length": 3239.4, "completions/mean_length": 671.404443359375, "completions/mean_terminated_length": 679.5443115234375, "completions/min_length": 0.0, "completions/min_terminated_length": 220.8, "epoch": 0.28799640004499943, "grad_norm": 0.0005066812736913562, "learning_rate": 2.8846153846153845e-06, "loss": -0.0102, "num_tokens": 237375927.0, "reward": 1.2086429357528687, "reward_std": 0.18110458552837372, "rewards/accuracy_reward": 0.6574652671813965, "rewards/brier_reward": 0.7718732714653015, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9879340171813965, "rewards/mean_confidence_reward": 0.6636371493339539, "signal/accuracy_reward/centered_abs_mean": 0.1777994751930237, "signal/accuracy_reward/group_std_mean": 0.23412391245365144, "signal/accuracy_reward/group_zero_std_frac": 0.33333333730697634, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08889973759651185, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08889973759651185, "signal/advantage_abs_mean": 0.1341766595840454, "signal/advantage_pre_scale_abs_mean": 0.1341766595840454, "signal/advantage_pre_scale_std": 0.22400145828723908, "signal/advantage_std": 0.22400145828723908, "signal/brier_reward/centered_abs_mean": 0.08540225774049759, "signal/brier_reward/group_std_mean": 0.11299550235271454, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.042701128870248795, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.042701128870248795, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.019471571408212186, "signal/format_reward/group_std_mean": 0.035466664284467694, "signal/format_reward/group_zero_std_frac": 0.8555555462837219, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.009735785704106093, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009735785704106093, "signal/mean_confidence_reward/centered_abs_mean": 0.04995632916688919, "signal/mean_confidence_reward/group_std_mean": 0.06900080442428588, "signal/mean_confidence_reward/group_zero_std_frac": 0.002777777798473835, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.995632764348556e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.995632764348556e-07, "step": 120 }, { "calibration/aurc": 0.28509394561381657, "calibration/batch_distribution_entropy": 0.3228088057534171, "calibration/confidence_entropy": 0.6248792534792422, "calibration/coverage@0%": 0.004799310124232847, "calibration/coverage@1%": 0.004799310124232847, "calibration/coverage@10%": 0.010543435450603605, "calibration/coverage@15%": 0.15625447942692722, "calibration/coverage@20%": 0.3529709503850561, "calibration/coverage@25%": 0.4145983379501385, "calibration/coverage@30%": 0.4145983379501385, "calibration/coverage@5%": 0.004799310124232847, "calibration/ece": 0.09946176516240221, "calibration/mean_confidence": 0.6740883957287108, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017100694444444443, "completions/max_length": 3132.8, "completions/max_terminated_length": 3132.8, "completions/mean_length": 665.89072265625, "completions/mean_terminated_length": 677.5413330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 221.6, "epoch": 0.2999962500468744, "grad_norm": 0.0007231877534650266, "learning_rate": 3.0048076923076923e-06, "loss": -0.0174, "num_tokens": 248164652.0, "reward": 1.2162556171417236, "reward_std": 0.18271680176258087, "rewards/accuracy_reward": 0.6735243082046509, "rewards/brier_reward": 0.7763347506523133, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9826388955116272, "rewards/mean_confidence_reward": 0.659646725654602, "signal/accuracy_reward/centered_abs_mean": 0.17083875834941864, "signal/accuracy_reward/group_std_mean": 0.22510201036930083, "signal/accuracy_reward/group_zero_std_frac": 0.3666666686534882, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08541937917470932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08541937917470932, "signal/advantage_abs_mean": 0.13394537568092346, "signal/advantage_pre_scale_abs_mean": 0.13394537568092346, "signal/advantage_pre_scale_std": 0.23040358424186708, "signal/advantage_std": 0.23040358424186708, "signal/brier_reward/centered_abs_mean": 0.08513147979974747, "signal/brier_reward/group_std_mean": 0.11398477852344513, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.042565739899873736, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.042565739899873736, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02859157957136631, "signal/format_reward/group_std_mean": 0.0496931079775095, "signal/format_reward/group_zero_std_frac": 0.8111111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.014295789785683155, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.014295789785683155, "signal/mean_confidence_reward/centered_abs_mean": 0.05194374620914459, "signal/mean_confidence_reward/group_std_mean": 0.07293412387371064, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.194374409711599e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.194374409711599e-07, "step": 125 }, { "calibration/aurc": 0.3523769550594987, "calibration/batch_distribution_entropy": 0.3730742715910721, "calibration/confidence_entropy": 0.6311129894171271, "calibration/coverage@0%": 0.002684138234970069, "calibration/coverage@1%": 0.002684138234970069, "calibration/coverage@10%": 0.002684138234970069, "calibration/coverage@15%": 0.002684138234970069, "calibration/coverage@20%": 0.002684138234970069, "calibration/coverage@25%": 0.14485437009734853, "calibration/coverage@30%": 0.45235186310269493, "calibration/coverage@5%": 0.002684138234970069, "calibration/ece": 0.08197692275911814, "calibration/mean_confidence": 0.660474184890275, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02352430555555556, "completions/max_length": 3590.2, "completions/max_terminated_length": 3590.2, "completions/mean_length": 704.85166015625, "completions/mean_terminated_length": 721.8248291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 205.6, "epoch": 0.3119961000487494, "grad_norm": 0.0006113589624874294, "learning_rate": 3.125e-06, "loss": -0.0207, "num_tokens": 259409343.0, "reward": 1.191517186164856, "reward_std": 0.202205029129982, "rewards/accuracy_reward": 0.6450520873069763, "rewards/brier_reward": 0.7622747302055359, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.975694453716278, "rewards/mean_confidence_reward": 0.645373260974884, "signal/accuracy_reward/centered_abs_mean": 0.19106445610523223, "signal/accuracy_reward/group_std_mean": 0.2509522706270218, "signal/accuracy_reward/group_zero_std_frac": 0.29166666567325594, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.09553222805261612, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.09553222805261612, "signal/advantage_abs_mean": 0.1484668254852295, "signal/advantage_pre_scale_abs_mean": 0.1484668254852295, "signal/advantage_pre_scale_std": 0.24313803315162658, "signal/advantage_std": 0.24313803315162658, "signal/brier_reward/centered_abs_mean": 0.08987482637166977, "signal/brier_reward/group_std_mean": 0.12072644084692001, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.044937413185834885, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.044937413185834885, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.03408203050494194, "signal/format_reward/group_std_mean": 0.06009537056088447, "signal/format_reward/group_zero_std_frac": 0.7666666865348816, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01704101525247097, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01704101525247097, "signal/mean_confidence_reward/centered_abs_mean": 0.056934678554534913, "signal/mean_confidence_reward/group_std_mean": 0.07846869677305221, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.693467755918391e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.693467755918391e-07, "step": 130 }, { "calibration/aurc": 0.3075408569848155, "calibration/batch_distribution_entropy": 0.35581563934470617, "calibration/confidence_entropy": 0.6290419743007951, "calibration/coverage@0%": 0.006770833333333333, "calibration/coverage@1%": 0.006770833333333333, "calibration/coverage@10%": 0.20646900269541776, "calibration/coverage@15%": 0.20646900269541776, "calibration/coverage@20%": 0.20646900269541776, "calibration/coverage@25%": 0.32776280323450135, "calibration/coverage@30%": 0.4, "calibration/coverage@5%": 0.006770833333333333, "calibration/ece": 0.12706097813011646, "calibration/mean_confidence": 0.6632383123565602, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02144097222222221, "completions/max_length": 3406.2, "completions/max_terminated_length": 3406.2, "completions/mean_length": 689.4444458007813, "completions/mean_terminated_length": 704.5305786132812, "completions/min_length": 0.0, "completions/min_terminated_length": 200.8, "epoch": 0.32399595005062437, "grad_norm": 0.0006831000209785998, "learning_rate": 3.245192307692308e-06, "loss": -0.02, "num_tokens": 270444767.0, "reward": 1.21013400554657, "reward_std": 0.1862725466489792, "rewards/accuracy_reward": 0.6684895873069763, "rewards/brier_reward": 0.7733800053596497, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9783854246139526, "rewards/mean_confidence_reward": 0.64532550573349, "signal/accuracy_reward/centered_abs_mean": 0.16418727934360505, "signal/accuracy_reward/group_std_mean": 0.22129693925380706, "signal/accuracy_reward/group_zero_std_frac": 0.3638888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08209363967180253, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08209363967180253, "signal/advantage_abs_mean": 0.1319579228758812, "signal/advantage_pre_scale_abs_mean": 0.1319579228758812, "signal/advantage_pre_scale_std": 0.23068281412124633, "signal/advantage_std": 0.23068281412124633, "signal/brier_reward/centered_abs_mean": 0.08366121798753738, "signal/brier_reward/group_std_mean": 0.11530399322509766, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04183060899376869, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04183060899376869, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.03651801273226738, "signal/format_reward/group_std_mean": 0.06655258983373642, "signal/format_reward/group_zero_std_frac": 0.7361111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01825900636613369, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01825900636613369, "signal/mean_confidence_reward/centered_abs_mean": 0.05818576216697693, "signal/mean_confidence_reward/group_std_mean": 0.08201179653406143, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.818576369165385e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.818576369165385e-07, "step": 135 }, { "calibration/aurc": 0.25876096173136887, "calibration/batch_distribution_entropy": 0.36616209260021615, "calibration/confidence_entropy": 0.6183446937704404, "calibration/coverage@0%": 0.005828297419981208, "calibration/coverage@1%": 0.005828297419981208, "calibration/coverage@10%": 0.005828297419981208, "calibration/coverage@15%": 0.045721021593716486, "calibration/coverage@20%": 0.2344463328084702, "calibration/coverage@25%": 0.5391183827875107, "calibration/coverage@30%": 0.756735480574539, "calibration/coverage@5%": 0.005828297419981208, "calibration/ece": 0.0660076350482702, "calibration/mean_confidence": 0.6786328518081711, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02065972222222221, "completions/max_length": 3374.0, "completions/max_terminated_length": 3374.0, "completions/mean_length": 682.2421020507812, "completions/mean_terminated_length": 696.599853515625, "completions/min_length": 0.0, "completions/min_terminated_length": 215.2, "epoch": 0.33599580005249935, "grad_norm": 0.0005992869264446199, "learning_rate": 3.365384615384616e-06, "loss": -0.0206, "num_tokens": 281408420.0, "reward": 1.204754900932312, "reward_std": 0.18304466903209687, "rewards/accuracy_reward": 0.6586805582046509, "rewards/brier_reward": 0.771822714805603, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9789930582046509, "rewards/mean_confidence_reward": 0.6663758635520936, "signal/accuracy_reward/centered_abs_mean": 0.1638129323720932, "signal/accuracy_reward/group_std_mean": 0.21214520335197448, "signal/accuracy_reward/group_zero_std_frac": 0.4111111104488373, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0819064661860466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0819064661860466, "signal/advantage_abs_mean": 0.13533268719911576, "signal/advantage_pre_scale_abs_mean": 0.13533268719911576, "signal/advantage_pre_scale_std": 0.2357187330722809, "signal/advantage_std": 0.2357187330722809, "signal/brier_reward/centered_abs_mean": 0.0858886182308197, "signal/brier_reward/group_std_mean": 0.11624894440174102, "signal/brier_reward/group_zero_std_frac": 0.002777777798473835, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04294430911540985, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04294430911540985, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.03479817770421505, "signal/format_reward/group_std_mean": 0.06239920780062676, "signal/format_reward/group_zero_std_frac": 0.7555555701255798, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.017399088852107523, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.017399088852107523, "signal/mean_confidence_reward/centered_abs_mean": 0.048192010074853894, "signal/mean_confidence_reward/group_std_mean": 0.07439302504062653, "signal/mean_confidence_reward/group_zero_std_frac": 0.002777777798473835, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.819201024020003e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.819201024020003e-07, "step": 140 }, { "calibration/aurc": 0.30395678877779303, "calibration/batch_distribution_entropy": 0.39360683860759516, "calibration/confidence_entropy": 0.6159341297579106, "calibration/coverage@0%": 0.005277044854881266, "calibration/coverage@1%": 0.005277044854881266, "calibration/coverage@10%": 0.02058047493403694, "calibration/coverage@15%": 0.02058047493403694, "calibration/coverage@20%": 0.03818047493403694, "calibration/coverage@25%": 0.4172129634706331, "calibration/coverage@30%": 0.45416916399833757, "calibration/coverage@5%": 0.005277044854881266, "calibration/ece": 0.0944133050460224, "calibration/mean_confidence": 0.6783376892015438, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017274305555555557, "completions/max_length": 3479.2, "completions/max_terminated_length": 3479.2, "completions/mean_length": 665.6880126953125, "completions/mean_terminated_length": 677.5132202148437, "completions/min_length": 0.0, "completions/min_terminated_length": 209.4, "epoch": 0.34799565005437433, "grad_norm": 0.0006123962230049074, "learning_rate": 3.4855769230769233e-06, "loss": -0.0172, "num_tokens": 292141754.0, "reward": 1.23957302570343, "reward_std": 0.1664572149515152, "rewards/accuracy_reward": 0.7046006917953491, "rewards/brier_reward": 0.7918928861618042, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9826388835906983, "rewards/mean_confidence_reward": 0.6715477585792542, "signal/accuracy_reward/centered_abs_mean": 0.14133571982383727, "signal/accuracy_reward/group_std_mean": 0.19653923213481903, "signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07066785991191864, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07066785991191864, "signal/advantage_abs_mean": 0.11513669937849044, "signal/advantage_pre_scale_abs_mean": 0.11513669937849044, "signal/advantage_pre_scale_std": 0.21813906133174896, "signal/advantage_std": 0.21813906133174896, "signal/brier_reward/centered_abs_mean": 0.07627894431352615, "signal/brier_reward/group_std_mean": 0.10811058431863785, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.038139472156763075, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.038139472156763075, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02968749925494194, "signal/format_reward/group_std_mean": 0.05382074937224388, "signal/format_reward/group_zero_std_frac": 0.7888888835906982, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01484374962747097, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01484374962747097, "signal/mean_confidence_reward/centered_abs_mean": 0.047725367546081546, "signal/mean_confidence_reward/group_std_mean": 0.07114448547363281, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.77253706776537e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.77253706776537e-07, "step": 145 }, { "calibration/aurc": 0.251996754365757, "calibration/batch_distribution_entropy": 0.45746643480176435, "calibration/confidence_entropy": 0.6179645028884286, "calibration/coverage@0%": 0.013260924380017058, "calibration/coverage@1%": 0.013260924380017058, "calibration/coverage@10%": 0.19973721380082005, "calibration/coverage@15%": 0.23367977254755629, "calibration/coverage@20%": 0.3721824463443477, "calibration/coverage@25%": 0.5101195631234725, "calibration/coverage@30%": 0.5101195631234725, "calibration/coverage@5%": 0.013260924380017058, "calibration/ece": 0.12633145729920384, "calibration/mean_confidence": 0.6679208784411201, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01475694444444442, "completions/max_length": 3322.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 720.0317749023437, "completions/mean_terminated_length": 730.7849731445312, "completions/min_length": 0.0, "completions/min_terminated_length": 211.4, "epoch": 0.3599955000562493, "grad_norm": 0.0006007853080518544, "learning_rate": 3.605769230769231e-06, "loss": -0.0124, "num_tokens": 303546856.0, "reward": 1.2211382150650025, "reward_std": 0.17774512469768525, "rewards/accuracy_reward": 0.6747395873069764, "rewards/brier_reward": 0.782454001903534, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9850694537162781, "rewards/mean_confidence_reward": 0.663671875, "signal/accuracy_reward/centered_abs_mean": 0.1687011733651161, "signal/accuracy_reward/group_std_mean": 0.22048845887184143, "signal/accuracy_reward/group_zero_std_frac": 0.37222222089767454, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08435058668255806, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08435058668255806, "signal/advantage_abs_mean": 0.13075250685214995, "signal/advantage_pre_scale_abs_mean": 0.13075250685214995, "signal/advantage_pre_scale_std": 0.22415777146816254, "signal/advantage_std": 0.22415777146816254, "signal/brier_reward/centered_abs_mean": 0.08473162204027176, "signal/brier_reward/group_std_mean": 0.11439146846532822, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04236581102013588, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04236581102013588, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02452256940305233, "signal/format_reward/group_std_mean": 0.048194213211536406, "signal/format_reward/group_zero_std_frac": 0.7916666865348816, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012261284701526164, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012261284701526164, "signal/mean_confidence_reward/centered_abs_mean": 0.0537755012512207, "signal/mean_confidence_reward/group_std_mean": 0.07779382765293122, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.377549882723543e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.377549882723543e-07, "step": 150 }, { "epoch": 0.3599955000562493, "eval_calibration/aurc": 0.23776656701099763, "eval_calibration/batch_distribution_entropy": 0.42524055884820533, "eval_calibration/confidence_entropy": 0.6214056419373937, "eval_calibration/coverage@0%": 0.06384408602150538, "eval_calibration/coverage@1%": 0.06384408602150538, "eval_calibration/coverage@10%": 0.1713709677419355, "eval_calibration/coverage@15%": 0.21438172043010753, "eval_calibration/coverage@20%": 0.32913306451612906, "eval_calibration/coverage@25%": 0.5843413978494624, "eval_calibration/coverage@30%": 0.8870967741935484, "eval_calibration/coverage@5%": 0.06384408602150538, "eval_calibration/ece": 0.13340893817204305, "eval_calibration/mean_confidence": 0.6707745295698925, "eval_completions/clipped_ratio": 0.011284722222222229, "eval_completions/max_length": 2065.3333333333335, "eval_completions/max_terminated_length": 2065.3333333333335, "eval_completions/mean_length": 687.8250122070312, "eval_completions/mean_terminated_length": 695.781494140625, "eval_completions/min_length": 61.333333333333336, "eval_completions/min_terminated_length": 280.0, "eval_loss": 0.0, "eval_num_tokens": 303546856.0, "eval_reward": 1.2187056342760723, "eval_reward_std": 0.3328238030274709, "eval_rewards/accuracy_reward": 0.6657986044883728, "eval_rewards/brier_reward": 0.7837521930535635, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.987847218910853, "eval_rewards/mean_confidence_reward": 0.6589843730131785, "eval_runtime": 190.9235, "eval_samples_per_second": 5.238, "eval_signal/accuracy_reward/centered_abs_mean": 0.4261610259612401, "eval_signal/accuracy_reward/group_std_mean": 0.4678809891144435, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21308051298062006, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21308051298062006, "eval_signal/advantage_abs_mean": 0.2922206570704778, "eval_signal/advantage_pre_scale_abs_mean": 0.2922206570704778, "eval_signal/advantage_pre_scale_std": 0.33035990099112195, "eval_signal/advantage_std": 0.33035990099112195, "eval_signal/brier_reward/centered_abs_mean": 0.1495609680811564, "eval_signal/brier_reward/group_std_mean": 0.18264500300089517, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0747804840405782, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.0747804840405782, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.023328993159035843, "eval_signal/format_reward/group_std_mean": 0.06276767452557881, "eval_signal/format_reward/group_zero_std_frac": 0.6666666766007742, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.011664496579517921, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.011664496579517921, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.06980522970358531, "eval_signal/mean_confidence_reward/group_std_mean": 0.10301572953661282, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.980522850123331e-07, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 6.980522850123331e-07, "eval_steps_per_second": 0.031, "step": 150 }, { "epoch": 0.3599955000562493, "step": 150, "train_probe_calibration/aurc": 0.23897342522938161, "train_probe_calibration/batch_distribution_entropy": 0.511896602598975, "train_probe_calibration/confidence_entropy": 0.6177175620212864, "train_probe_calibration/coverage@0%": 0.06821236559139786, "train_probe_calibration/coverage@1%": 0.06821236559139786, "train_probe_calibration/coverage@10%": 0.1723790322580645, "train_probe_calibration/coverage@15%": 0.1723790322580645, "train_probe_calibration/coverage@20%": 0.3592069892473118, "train_probe_calibration/coverage@25%": 0.7552083333333334, "train_probe_calibration/coverage@30%": 0.8489583333333334, "train_probe_calibration/coverage@5%": 0.06821236559139786, "train_probe_calibration/ece": 0.1607358870967742, "train_probe_calibration/mean_confidence": 0.6693380376344087, "train_probe_completions/clipped_ratio": 0.011111111111111127, "train_probe_completions/max_length": 2180.3333333333335, "train_probe_completions/max_terminated_length": 2180.3333333333335, "train_probe_completions/mean_length": 696.967539469401, "train_probe_completions/mean_terminated_length": 704.8497619628906, "train_probe_completions/min_length": 0.0, "train_probe_completions/min_terminated_length": 256.8333333333333, "train_probe_loss": 0.0, "train_probe_num_tokens": 303546856.0, "train_probe_reward": 1.237586994965871, "train_probe_reward_std": 0.31869522233804065, "train_probe_rewards/accuracy_reward": 0.6892361144224802, "train_probe_rewards/brier_reward": 0.7937369843324026, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9921875099341074, "train_probe_rewards/mean_confidence_reward": 0.6606770853201548, "train_probe_runtime": 203.3777, "train_probe_samples_per_second": 4.917, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4159071147441864, "train_probe_signal/accuracy_reward/group_std_mean": 0.4619760662317276, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.2079535573720932, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.2079535573720932, "train_probe_signal/advantage_abs_mean": 0.2792310416698456, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2792310416698456, "train_probe_signal/advantage_pre_scale_std": 0.3166344265143077, "train_probe_signal/advantage_std": 0.3166344265143077, "train_probe_signal/brier_reward/centered_abs_mean": 0.1411424552400907, "train_probe_signal/brier_reward/group_std_mean": 0.17277308801809946, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07057122762004535, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07057122762004535, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.015028211598594984, "train_probe_signal/format_reward/group_std_mean": 0.041204764818151794, "train_probe_signal/format_reward/group_zero_std_frac": 0.7777778108914694, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.007514105799297492, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.007514105799297492, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.06911892257630825, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.09736241524418195, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.911891622015295e-07, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 6.911891622015295e-07, "train_probe_steps_per_second": 0.03 }, { "calibration/aurc": 0.2189705908055147, "calibration/batch_distribution_entropy": 0.5038569771593934, "calibration/confidence_entropy": 0.6203069388942695, "calibration/coverage@0%": 0.0026403212422949265, "calibration/coverage@1%": 0.0026403212422949265, "calibration/coverage@10%": 0.037092668326220955, "calibration/coverage@15%": 0.29890573138928406, "calibration/coverage@20%": 0.5354850938014344, "calibration/coverage@25%": 0.5943841278589143, "calibration/coverage@30%": 0.8706743995131809, "calibration/coverage@5%": 0.0026403212422949265, "calibration/ece": 0.09079290043263846, "calibration/mean_confidence": 0.6655687581224445, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011458333333333348, "completions/max_length": 3339.6, "completions/max_terminated_length": 3339.6, "completions/mean_length": 668.9455688476562, "completions/mean_terminated_length": 676.7902221679688, "completions/min_length": 0.0, "completions/min_terminated_length": 198.2, "epoch": 0.3719953500581243, "grad_norm": 0.0005948090692982078, "learning_rate": 3.725961538461539e-06, "loss": -0.011, "num_tokens": 314360821.0, "reward": 1.2574485301971436, "reward_std": 0.17675618529319764, "rewards/accuracy_reward": 0.7238715291023254, "rewards/brier_reward": 0.8025573015213012, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9884548425674439, "rewards/mean_confidence_reward": 0.6635850667953491, "signal/accuracy_reward/centered_abs_mean": 0.17650282084941865, "signal/accuracy_reward/group_std_mean": 0.22730407118797302, "signal/accuracy_reward/group_zero_std_frac": 0.3777777761220932, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08825141042470933, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08825141042470933, "signal/advantage_abs_mean": 0.1317974552512169, "signal/advantage_pre_scale_abs_mean": 0.1317974552512169, "signal/advantage_pre_scale_std": 0.22176082134246827, "signal/advantage_std": 0.22176082134246827, "signal/brier_reward/centered_abs_mean": 0.08348918706178665, "signal/brier_reward/group_std_mean": 0.11194718182086945, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.041744593530893326, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.041744593530893326, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.02004665844142437, "signal/format_reward/group_std_mean": 0.03744541220366955, "signal/format_reward/group_zero_std_frac": 0.8472222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.010023329220712186, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010023329220712186, "signal/mean_confidence_reward/centered_abs_mean": 0.055129934847354886, "signal/mean_confidence_reward/group_std_mean": 0.07638790011405945, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.512993539014133e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.512993539014133e-07, "step": 155 }, { "calibration/aurc": 0.2347298170386006, "calibration/batch_distribution_entropy": 0.5409215572722538, "calibration/confidence_entropy": 0.5984361247458366, "calibration/coverage@0%": 0.002641848451669053, "calibration/coverage@1%": 0.002641848451669053, "calibration/coverage@10%": 0.002641848451669053, "calibration/coverage@15%": 0.32948286027120355, "calibration/coverage@20%": 0.615625, "calibration/coverage@25%": 0.784375, "calibration/coverage@30%": 0.8, "calibration/coverage@5%": 0.002641848451669053, "calibration/ece": 0.1347699358862871, "calibration/mean_confidence": 0.6941470693786417, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015104166666666674, "completions/max_length": 3622.4, "completions/max_terminated_length": 3622.4, "completions/mean_length": 680.0411499023437, "completions/mean_terminated_length": 690.4038330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 208.0, "epoch": 0.38399520005999926, "grad_norm": 0.0006509215454570949, "learning_rate": 3.846153846153847e-06, "loss": -0.0147, "num_tokens": 325282191.0, "reward": 1.2107713222503662, "reward_std": 0.18045419752597808, "rewards/accuracy_reward": 0.6597222328186035, "rewards/brier_reward": 0.7769977211952209, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9848090291023255, "rewards/mean_confidence_reward": 0.6800069451332093, "signal/accuracy_reward/centered_abs_mean": 0.16652560532093047, "signal/accuracy_reward/group_std_mean": 0.22055523097515106, "signal/accuracy_reward/group_zero_std_frac": 0.3638888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08326280266046523, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08326280266046523, "signal/advantage_abs_mean": 0.1333080381155014, "signal/advantage_pre_scale_abs_mean": 0.1333080381155014, "signal/advantage_pre_scale_std": 0.22689380645751953, "signal/advantage_std": 0.22689380645751953, "signal/brier_reward/centered_abs_mean": 0.09306411743164063, "signal/brier_reward/group_std_mean": 0.12321935147047043, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.046532058715820314, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.046532058715820314, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.024723307229578495, "signal/format_reward/group_std_mean": 0.042361725121736526, "signal/format_reward/group_zero_std_frac": 0.8388888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012361653614789248, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012361653614789248, "signal/mean_confidence_reward/centered_abs_mean": 0.058087891340255736, "signal/mean_confidence_reward/group_std_mean": 0.07963319718837739, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.808788955619093e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.808788955619093e-07, "step": 160 }, { "calibration/aurc": 0.26699546088603016, "calibration/batch_distribution_entropy": 0.5694606991846488, "calibration/confidence_entropy": 0.6015419626212064, "calibration/coverage@0%": 0.0052567618125593995, "calibration/coverage@1%": 0.0052567618125593995, "calibration/coverage@10%": 0.16984009514589274, "calibration/coverage@15%": 0.3090018014950991, "calibration/coverage@20%": 0.44159184078755126, "calibration/coverage@25%": 0.5250853009566147, "calibration/coverage@30%": 0.5848443887420453, "calibration/coverage@5%": 0.0052567618125593995, "calibration/ece": 0.13983723108529078, "calibration/mean_confidence": 0.6822994662985316, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01640625, "completions/max_length": 3302.4, "completions/max_terminated_length": 3302.4, "completions/mean_length": 715.052783203125, "completions/mean_terminated_length": 727.1060791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 154.8, "epoch": 0.39599505006187424, "grad_norm": 0.0005810638540424407, "learning_rate": 3.966346153846154e-06, "loss": -0.0143, "num_tokens": 336658671.0, "reward": 1.2005762577056884, "reward_std": 0.17796370685100554, "rewards/accuracy_reward": 0.6429687380790711, "rewards/brier_reward": 0.7746632099151611, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9835069417953491, "rewards/mean_confidence_reward": 0.676325523853302, "signal/accuracy_reward/centered_abs_mean": 0.1633843332529068, "signal/accuracy_reward/group_std_mean": 0.2169993758201599, "signal/accuracy_reward/group_zero_std_frac": 0.38055555820465087, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0816921666264534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0816921666264534, "signal/advantage_abs_mean": 0.13168618083000183, "signal/advantage_pre_scale_abs_mean": 0.13168618083000183, "signal/advantage_pre_scale_std": 0.22412039041519166, "signal/advantage_std": 0.22412039041519166, "signal/brier_reward/centered_abs_mean": 0.0937480315566063, "signal/brier_reward/group_std_mean": 0.12342212647199631, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04687401577830315, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04687401577830315, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.024739583395421504, "signal/format_reward/group_std_mean": 0.04207911007106304, "signal/format_reward/group_zero_std_frac": 0.8361111283302307, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012369791697710752, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012369791697710752, "signal/mean_confidence_reward/centered_abs_mean": 0.06112988218665123, "signal/mean_confidence_reward/group_std_mean": 0.08158329874277115, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.112988330642111e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.112988330642111e-07, "step": 165 }, { "calibration/aurc": 0.18950245147490588, "calibration/batch_distribution_entropy": 0.4605112433303056, "calibration/confidence_entropy": 0.5873910288730165, "calibration/coverage@0%": 0.018113322142594045, "calibration/coverage@1%": 0.018113322142594045, "calibration/coverage@10%": 0.321328478397468, "calibration/coverage@15%": 0.5105694913769195, "calibration/coverage@20%": 0.6189877069655838, "calibration/coverage@25%": 0.6189877069655838, "calibration/coverage@30%": 0.7496963683829065, "calibration/coverage@5%": 0.018113322142594045, "calibration/ece": 0.11368899819896634, "calibration/mean_confidence": 0.7131175395667277, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009982638888888885, "completions/max_length": 3441.6, "completions/max_terminated_length": 3441.6, "completions/mean_length": 714.3421142578125, "completions/mean_terminated_length": 721.6017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 228.8, "epoch": 0.4079949000637492, "grad_norm": 0.0005395952030085027, "learning_rate": 4.086538461538462e-06, "loss": -0.0099, "num_tokens": 347977076.0, "reward": 1.2539068937301636, "reward_std": 0.1635847806930542, "rewards/accuracy_reward": 0.7121527791023254, "rewards/brier_reward": 0.8058033585548401, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.989843738079071, "rewards/mean_confidence_reward": 0.6932552218437195, "signal/accuracy_reward/centered_abs_mean": 0.1515733480453491, "signal/accuracy_reward/group_std_mean": 0.20517812371253968, "signal/accuracy_reward/group_zero_std_frac": 0.3944444417953491, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07578667402267455, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07578667402267455, "signal/advantage_abs_mean": 0.11742317229509354, "signal/advantage_pre_scale_abs_mean": 0.11742317229509354, "signal/advantage_pre_scale_std": 0.21010645627975463, "signal/advantage_std": 0.21010645627975463, "signal/brier_reward/centered_abs_mean": 0.08143201023340225, "signal/brier_reward/group_std_mean": 0.1106080636382103, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04071600511670113, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04071600511670113, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.017681206855922938, "signal/format_reward/group_std_mean": 0.035257697105407715, "signal/format_reward/group_zero_std_frac": 0.8472222208976745, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.008840603427961469, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.008840603427961469, "signal/mean_confidence_reward/centered_abs_mean": 0.04964464083313942, "signal/mean_confidence_reward/group_std_mean": 0.06889676898717881, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.964464210388542e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.964464210388542e-07, "step": 170 }, { "calibration/aurc": 0.17951174050774393, "calibration/batch_distribution_entropy": 0.46488215743109385, "calibration/confidence_entropy": 0.601026058831069, "calibration/coverage@0%": 0.0069566142265753365, "calibration/coverage@1%": 0.0069566142265753365, "calibration/coverage@10%": 0.15600810474148047, "calibration/coverage@15%": 0.28778098698643195, "calibration/coverage@20%": 0.7586586801594432, "calibration/coverage@25%": 0.8687135491331283, "calibration/coverage@30%": 0.9519788918205805, "calibration/coverage@5%": 0.0069566142265753365, "calibration/ece": 0.10965007030371851, "calibration/mean_confidence": 0.6886369931341909, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013715277777777746, "completions/max_length": 3696.8, "completions/max_terminated_length": 3696.8, "completions/mean_length": 797.1045288085937, "completions/mean_terminated_length": 808.1990356445312, "completions/min_length": 0.0, "completions/min_terminated_length": 235.0, "epoch": 0.4199947500656242, "grad_norm": 0.0005441954708658159, "learning_rate": 4.20673076923077e-06, "loss": -0.0124, "num_tokens": 360267688.0, "reward": 1.2437713146209717, "reward_std": 0.16432649791240692, "rewards/accuracy_reward": 0.7017361044883728, "rewards/brier_reward": 0.7997685313224793, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9860243082046509, "rewards/mean_confidence_reward": 0.6790468692779541, "signal/accuracy_reward/centered_abs_mean": 0.15365668535232543, "signal/accuracy_reward/group_std_mean": 0.20976330041885377, "signal/accuracy_reward/group_zero_std_frac": 0.37777777314186095, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07682834267616272, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07682834267616272, "signal/advantage_abs_mean": 0.118514883518219, "signal/advantage_pre_scale_abs_mean": 0.118514883518219, "signal/advantage_pre_scale_std": 0.2132311314344406, "signal/advantage_std": 0.2132311314344406, "signal/brier_reward/centered_abs_mean": 0.08158342689275741, "signal/brier_reward/group_std_mean": 0.10989912897348404, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.040791713446378705, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.040791713446378705, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.021305338107049467, "signal/format_reward/group_std_mean": 0.03604287914931774, "signal/format_reward/group_zero_std_frac": 0.8638888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.010652669053524733, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010652669053524733, "signal/mean_confidence_reward/centered_abs_mean": 0.05458425879478455, "signal/mean_confidence_reward/group_std_mean": 0.07332057952880859, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.458425675897161e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.458425675897161e-07, "step": 175 }, { "calibration/aurc": 0.13073198296441638, "calibration/batch_distribution_entropy": 0.5380331997105168, "calibration/confidence_entropy": 0.6189836117483226, "calibration/coverage@0%": 0.0031981811447374096, "calibration/coverage@1%": 0.0031981811447374096, "calibration/coverage@10%": 0.500512497426578, "calibration/coverage@15%": 0.7357418214341888, "calibration/coverage@20%": 0.8534599544952848, "calibration/coverage@25%": 0.96, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.0031981811447374096, "calibration/ece": 0.14052891097492723, "calibration/mean_confidence": 0.6583465784315436, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017100694444444443, "completions/max_length": 3781.6, "completions/max_terminated_length": 3781.6, "completions/mean_length": 803.7421997070312, "completions/mean_terminated_length": 817.6618408203125, "completions/min_length": 0.0, "completions/min_terminated_length": 267.0, "epoch": 0.4319946000674992, "grad_norm": 0.0005887884763069451, "learning_rate": 4.326923076923077e-06, "loss": -0.0164, "num_tokens": 372626766.0, "reward": 1.2421112537384034, "reward_std": 0.16366690397262573, "rewards/accuracy_reward": 0.7052083253860474, "rewards/brier_reward": 0.7963624358177185, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9826388835906983, "rewards/mean_confidence_reward": 0.6401279687881469, "signal/accuracy_reward/centered_abs_mean": 0.1526584208011627, "signal/accuracy_reward/group_std_mean": 0.2030440092086792, "signal/accuracy_reward/group_zero_std_frac": 0.4138888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07632921040058135, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07632921040058135, "signal/advantage_abs_mean": 0.11888949573040009, "signal/advantage_pre_scale_abs_mean": 0.11888949573040009, "signal/advantage_pre_scale_std": 0.21288825571537018, "signal/advantage_std": 0.21288825571537018, "signal/brier_reward/centered_abs_mean": 0.08247195929288864, "signal/brier_reward/group_std_mean": 0.1119322955608368, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04123597964644432, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04123597964644432, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.02655164934694767, "signal/format_reward/group_std_mean": 0.04704082012176514, "signal/format_reward/group_zero_std_frac": 0.8111111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.013275824673473834, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.013275824673473834, "signal/mean_confidence_reward/centered_abs_mean": 0.06838627606630325, "signal/mean_confidence_reward/group_std_mean": 0.08998300731182099, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.838627200522751e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.838627200522751e-07, "step": 180 }, { "calibration/aurc": 0.2164541607435649, "calibration/batch_distribution_entropy": 0.5584828013879158, "calibration/confidence_entropy": 0.6081135810902737, "calibration/coverage@0%": 0.008370128361106174, "calibration/coverage@1%": 0.008370128361106174, "calibration/coverage@10%": 0.008370128361106174, "calibration/coverage@15%": 0.08186094200940014, "calibration/coverage@20%": 0.5181000572883361, "calibration/coverage@25%": 0.6824490525071801, "calibration/coverage@30%": 0.9328083989501312, "calibration/coverage@5%": 0.008370128361106174, "calibration/ece": 0.08970610261963805, "calibration/mean_confidence": 0.6582180475085961, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014496527777777768, "completions/max_length": 3496.2, "completions/max_terminated_length": 3496.2, "completions/mean_length": 823.4496459960938, "completions/mean_terminated_length": 835.5283325195312, "completions/min_length": 0.0, "completions/min_terminated_length": 236.2, "epoch": 0.44399445006937416, "grad_norm": 0.0005881107645109296, "learning_rate": 4.447115384615385e-06, "loss": -0.0133, "num_tokens": 385202954.0, "reward": 1.2252756118774415, "reward_std": 0.1711643397808075, "rewards/accuracy_reward": 0.672656261920929, "rewards/brier_reward": 0.792465353012085, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9854166626930236, "rewards/mean_confidence_reward": 0.6442065834999084, "signal/accuracy_reward/centered_abs_mean": 0.17211913764476777, "signal/accuracy_reward/group_std_mean": 0.22131863236427307, "signal/accuracy_reward/group_zero_std_frac": 0.39166667461395266, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08605956882238389, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08605956882238389, "signal/advantage_abs_mean": 0.12793176472187043, "signal/advantage_pre_scale_abs_mean": 0.12793176472187043, "signal/advantage_pre_scale_std": 0.21473873853683473, "signal/advantage_std": 0.21473873853683473, "signal/brier_reward/centered_abs_mean": 0.0880697324872017, "signal/brier_reward/group_std_mean": 0.11771784871816635, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04403486624360085, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04403486624360085, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02277560755610466, "signal/format_reward/group_std_mean": 0.04086199067533016, "signal/format_reward/group_zero_std_frac": 0.8361111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01138780377805233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01138780377805233, "signal/mean_confidence_reward/centered_abs_mean": 0.06673773601651192, "signal/mean_confidence_reward/group_std_mean": 0.087367182970047, "signal/mean_confidence_reward/group_zero_std_frac": 0.002777777798473835, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.673773668808281e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.673773668808281e-07, "step": 185 }, { "calibration/aurc": 0.19728734203162568, "calibration/batch_distribution_entropy": 0.42199398548785316, "calibration/confidence_entropy": 0.5845906622754902, "calibration/coverage@0%": 0.017327626254374305, "calibration/coverage@1%": 0.017327626254374305, "calibration/coverage@10%": 0.1400430309541132, "calibration/coverage@15%": 0.3438181397443656, "calibration/coverage@20%": 0.4030501675946702, "calibration/coverage@25%": 0.916532313914513, "calibration/coverage@30%": 0.9842931937172775, "calibration/coverage@5%": 0.017327626254374305, "calibration/ece": 0.09412128524422647, "calibration/mean_confidence": 0.7056541262239604, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011024305555555558, "completions/max_length": 3308.0, "completions/max_terminated_length": 3308.0, "completions/mean_length": 846.0178100585938, "completions/mean_terminated_length": 855.4560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 259.0, "epoch": 0.45599430007124914, "grad_norm": 0.0005994663806632161, "learning_rate": 4.567307692307692e-06, "loss": -0.0124, "num_tokens": 398032023.0, "reward": 1.2700544834136962, "reward_std": 0.16538404524326325, "rewards/accuracy_reward": 0.73515625, "rewards/brier_reward": 0.8159632563591004, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889756917953492, "rewards/mean_confidence_reward": 0.6839498996734619, "signal/accuracy_reward/centered_abs_mean": 0.15601671040058135, "signal/accuracy_reward/group_std_mean": 0.21195557117462158, "signal/accuracy_reward/group_zero_std_frac": 0.36944444179534913, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07800835520029067, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07800835520029067, "signal/advantage_abs_mean": 0.11601104736328124, "signal/advantage_pre_scale_abs_mean": 0.11601104736328124, "signal/advantage_pre_scale_std": 0.21007709801197053, "signal/advantage_std": 0.21007709801197053, "signal/brier_reward/centered_abs_mean": 0.07907404452562332, "signal/brier_reward/group_std_mean": 0.1120298370718956, "signal/brier_reward/group_zero_std_frac": 0.03888888973742723, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03953702226281166, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03953702226281166, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.019710286520421505, "signal/format_reward/group_std_mean": 0.04124503955245018, "signal/format_reward/group_zero_std_frac": 0.8166666746139526, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.009855143260210752, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009855143260210752, "signal/mean_confidence_reward/centered_abs_mean": 0.05257853865623474, "signal/mean_confidence_reward/group_std_mean": 0.07405076920986176, "signal/mean_confidence_reward/group_zero_std_frac": 0.04722222257405519, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.257853899820475e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.257853899820475e-07, "step": 190 }, { "calibration/aurc": 0.21424680958258424, "calibration/batch_distribution_entropy": 0.43943662298268266, "calibration/confidence_entropy": 0.5796439546778498, "calibration/coverage@0%": 0.00954466437080675, "calibration/coverage@1%": 0.00954466437080675, "calibration/coverage@10%": 0.02106580908411445, "calibration/coverage@15%": 0.029240196005095375, "calibration/coverage@20%": 0.6949435959583006, "calibration/coverage@25%": 0.7684918472384412, "calibration/coverage@30%": 0.7891267678733619, "calibration/coverage@5%": 0.00954466437080675, "calibration/ece": 0.10948040300717918, "calibration/mean_confidence": 0.6926145009124871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01605902777777777, "completions/max_length": 3981.6, "completions/max_terminated_length": 3981.6, "completions/mean_length": 849.8381958007812, "completions/mean_terminated_length": 863.7781494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 245.4, "epoch": 0.46799415007312406, "grad_norm": 0.0005687425727955997, "learning_rate": 4.6875000000000004e-06, "loss": -0.0161, "num_tokens": 410903023.0, "reward": 1.2276915788650513, "reward_std": 0.1744602084159851, "rewards/accuracy_reward": 0.6784722208976746, "rewards/brier_reward": 0.7932164311408997, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9836805582046508, "rewards/mean_confidence_reward": 0.688956618309021, "signal/accuracy_reward/centered_abs_mean": 0.1576931431889534, "signal/accuracy_reward/group_std_mean": 0.2115387111902237, "signal/accuracy_reward/group_zero_std_frac": 0.3861111104488373, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0788465715944767, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0788465715944767, "signal/advantage_abs_mean": 0.12698804587125778, "signal/advantage_pre_scale_abs_mean": 0.12698804587125778, "signal/advantage_pre_scale_std": 0.22495968639850616, "signal/advantage_std": 0.22495968639850616, "signal/brier_reward/centered_abs_mean": 0.09262087643146515, "signal/brier_reward/group_std_mean": 0.12360085844993592, "signal/brier_reward/group_zero_std_frac": 0.05277777947485447, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04631043821573257, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04631043821573257, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02618272602558136, "signal/format_reward/group_std_mean": 0.047728189080953595, "signal/format_reward/group_zero_std_frac": 0.8027777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01309136301279068, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01309136301279068, "signal/mean_confidence_reward/centered_abs_mean": 0.05439404398202896, "signal/mean_confidence_reward/group_std_mean": 0.07680513560771943, "signal/mean_confidence_reward/group_zero_std_frac": 0.0777777798473835, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.439404276330606e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.439404276330606e-07, "step": 195 }, { "calibration/aurc": 0.18797913931053406, "calibration/batch_distribution_entropy": 0.4253448350583195, "calibration/confidence_entropy": 0.580372254075088, "calibration/coverage@0%": 0.016846997720838332, "calibration/coverage@1%": 0.016846997720838332, "calibration/coverage@10%": 0.1937748991169989, "calibration/coverage@15%": 0.35277359021647536, "calibration/coverage@20%": 0.5996445105820106, "calibration/coverage@25%": 0.6121445105820106, "calibration/coverage@30%": 0.9176421957671957, "calibration/coverage@5%": 0.016846997720838332, "calibration/ece": 0.12447102390884815, "calibration/mean_confidence": 0.7115793122777138, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01328125, "completions/max_length": 3513.8, "completions/max_terminated_length": 3513.8, "completions/mean_length": 813.2226806640625, "completions/mean_terminated_length": 824.192138671875, "completions/min_length": 0.0, "completions/min_terminated_length": 245.2, "epoch": 0.47999400007499904, "grad_norm": 0.0005493377102538943, "learning_rate": 4.807692307692308e-06, "loss": -0.0117, "num_tokens": 423339156.0, "reward": 1.236952781677246, "reward_std": 0.17222749590873718, "rewards/accuracy_reward": 0.6858507037162781, "rewards/brier_reward": 0.8013223528861999, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9867187380790711, "rewards/mean_confidence_reward": 0.6887404441833496, "signal/accuracy_reward/centered_abs_mean": 0.1634494334459305, "signal/accuracy_reward/group_std_mean": 0.21404653787612915, "signal/accuracy_reward/group_zero_std_frac": 0.39166666865348815, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08172471672296525, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08172471672296525, "signal/advantage_abs_mean": 0.1277629017829895, "signal/advantage_pre_scale_abs_mean": 0.1277629017829895, "signal/advantage_pre_scale_std": 0.22268297970294954, "signal/advantage_std": 0.22268297970294954, "signal/brier_reward/centered_abs_mean": 0.08835268914699554, "signal/brier_reward/group_std_mean": 0.11781696230173111, "signal/brier_reward/group_zero_std_frac": 0.030555556528270245, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04417634457349777, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04417634457349777, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.021468098647892474, "signal/format_reward/group_std_mean": 0.039810846000909804, "signal/format_reward/group_zero_std_frac": 0.8361111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.010734049323946237, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010734049323946237, "signal/mean_confidence_reward/centered_abs_mean": 0.053413412719964984, "signal/mean_confidence_reward/group_std_mean": 0.073988276720047, "signal/mean_confidence_reward/group_zero_std_frac": 0.05000000111758709, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.341341079656558e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.341341079656558e-07, "step": 200 }, { "epoch": 0.47999400007499904, "eval_calibration/aurc": 0.16475993612097564, "eval_calibration/batch_distribution_entropy": 0.5033301043560275, "eval_calibration/confidence_entropy": 0.586726742027942, "eval_calibration/coverage@0%": 0.03715277777777778, "eval_calibration/coverage@1%": 0.03715277777777778, "eval_calibration/coverage@10%": 0.13090277777777778, "eval_calibration/coverage@15%": 0.5701388888888889, "eval_calibration/coverage@20%": 0.8125, "eval_calibration/coverage@25%": 0.8854166666666666, "eval_calibration/coverage@30%": 0.96875, "eval_calibration/coverage@5%": 0.03715277777777778, "eval_calibration/ece": 0.15282986111111116, "eval_calibration/mean_confidence": 0.6935937500000001, "eval_completions/clipped_ratio": 0.019965277777777807, "eval_completions/max_length": 2217.5, "eval_completions/max_terminated_length": 2217.5, "eval_completions/mean_length": 856.4458618164062, "eval_completions/mean_terminated_length": 873.8812662760416, "eval_completions/min_length": 80.66666666666667, "eval_completions/min_terminated_length": 354.0, "eval_loss": 0.0, "eval_num_tokens": 423339156.0, "eval_reward": 1.219944993654887, "eval_reward_std": 0.35690300663312274, "eval_rewards/accuracy_reward": 0.6692708333333334, "eval_rewards/brier_reward": 0.7931749125321707, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9774305621782938, "eval_rewards/mean_confidence_reward": 0.676562507947286, "eval_runtime": 213.1522, "eval_samples_per_second": 4.691, "eval_signal/accuracy_reward/centered_abs_mean": 0.4264865467945735, "eval_signal/accuracy_reward/group_std_mean": 0.4677298466364543, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.21324327339728674, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21324327339728674, "eval_signal/advantage_abs_mean": 0.3075044999519984, "eval_signal/advantage_pre_scale_abs_mean": 0.3075044999519984, "eval_signal/advantage_pre_scale_std": 0.3551974693934123, "eval_signal/advantage_std": 0.3551974693934123, "eval_signal/brier_reward/centered_abs_mean": 0.17520237465699515, "eval_signal/brier_reward/group_std_mean": 0.21767246474822363, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08760118732849757, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.08760118732849757, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.04286024331425627, "eval_signal/format_reward/group_std_mean": 0.10621986196686824, "eval_signal/format_reward/group_zero_std_frac": 0.4722222338120143, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.021430121657128137, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.021430121657128137, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.10149739931027095, "eval_signal/mean_confidence_reward/group_std_mean": 0.14832059169809023, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.0149739561408448e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.0149739561408448e-06, "eval_steps_per_second": 0.028, "step": 200 }, { "epoch": 0.47999400007499904, "step": 200, "train_probe_calibration/aurc": 0.2369223376741071, "train_probe_calibration/batch_distribution_entropy": 0.4503578903961598, "train_probe_calibration/confidence_entropy": 0.5849798713599211, "train_probe_calibration/coverage@0%": 0.042013888888888885, "train_probe_calibration/coverage@1%": 0.042013888888888885, "train_probe_calibration/coverage@10%": 0.16180555555555556, "train_probe_calibration/coverage@15%": 0.3180555555555556, "train_probe_calibration/coverage@20%": 0.5472222222222222, "train_probe_calibration/coverage@25%": 0.5628472222222222, "train_probe_calibration/coverage@30%": 0.6045138888888889, "train_probe_calibration/coverage@5%": 0.042013888888888885, "train_probe_calibration/ece": 0.13258680555555555, "train_probe_calibration/mean_confidence": 0.6896354166666666, "train_probe_completions/clipped_ratio": 0.017881944444444447, "train_probe_completions/max_length": 2854.5, "train_probe_completions/max_terminated_length": 2854.5, "train_probe_completions/mean_length": 843.8245035807291, "train_probe_completions/mean_terminated_length": 858.919179280599, "train_probe_completions/min_length": 97.33333333333333, "train_probe_completions/min_terminated_length": 307.3333333333333, "train_probe_loss": 0.0, "train_probe_num_tokens": 423339156.0, "train_probe_reward": 1.249226729075114, "train_probe_reward_std": 0.32946378489335376, "train_probe_rewards/accuracy_reward": 0.7005208333333334, "train_probe_rewards/brier_reward": 0.8109396596749624, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9869791666666666, "train_probe_rewards/mean_confidence_reward": 0.6814670066038767, "train_probe_runtime": 195.8912, "train_probe_samples_per_second": 5.105, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4059787392616272, "train_probe_signal/accuracy_reward/group_std_mean": 0.4562051594257355, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.2029893696308136, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.2029893696308136, "train_probe_signal/advantage_abs_mean": 0.282023290793101, "train_probe_signal/advantage_pre_scale_abs_mean": 0.282023290793101, "train_probe_signal/advantage_pre_scale_std": 0.32872556646664935, "train_probe_signal/advantage_std": 0.32872556646664935, "train_probe_signal/brier_reward/centered_abs_mean": 0.1585026135047277, "train_probe_signal/brier_reward/group_std_mean": 0.1987637703617414, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07925130675236385, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07925130675236385, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.024685330223292112, "train_probe_signal/format_reward/group_std_mean": 0.061172984850903354, "train_probe_signal/format_reward/group_zero_std_frac": 0.694444457689921, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.012342665111646056, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.012342665111646056, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.09747721254825592, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.1409267522394657, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 9.747721113247583e-07, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 9.747721113247583e-07, "train_probe_steps_per_second": 0.031 }, { "calibration/aurc": 0.18859796481820973, "calibration/batch_distribution_entropy": 0.49016137039685514, "calibration/confidence_entropy": 0.5832531739332502, "calibration/coverage@0%": 0.02528923342885303, "calibration/coverage@1%": 0.02528923342885303, "calibration/coverage@10%": 0.04051381320289547, "calibration/coverage@15%": 0.3348707685834729, "calibration/coverage@20%": 0.6121917469644648, "calibration/coverage@25%": 0.7104962682410605, "calibration/coverage@30%": 0.9425866102261334, "calibration/coverage@5%": 0.02528923342885303, "calibration/ece": 0.08907421082183815, "calibration/mean_confidence": 0.7008540438864228, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017534722222222233, "completions/max_length": 3608.0, "completions/max_terminated_length": 3608.0, "completions/mean_length": 869.8250122070312, "completions/mean_terminated_length": 885.385205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 240.6, "epoch": 0.491993850076874, "grad_norm": 0.0004947424167767167, "learning_rate": 4.927884615384616e-06, "loss": -0.0165, "num_tokens": 436425492.0, "reward": 1.2683157205581665, "reward_std": 0.17741301357746125, "rewards/accuracy_reward": 0.7374131917953491, "rewards/brier_reward": 0.8171733140945434, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.98203125, "rewards/mean_confidence_reward": 0.6796643614768982, "signal/accuracy_reward/centered_abs_mean": 0.15825195014476776, "signal/accuracy_reward/group_std_mean": 0.2160105437040329, "signal/accuracy_reward/group_zero_std_frac": 0.3583333432674408, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07912597507238388, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07912597507238388, "signal/advantage_abs_mean": 0.12522561550140382, "signal/advantage_pre_scale_abs_mean": 0.12522561550140382, "signal/advantage_pre_scale_std": 0.22589523792266847, "signal/advantage_std": 0.22589523792266847, "signal/brier_reward/centered_abs_mean": 0.08916214108467102, "signal/brier_reward/group_std_mean": 0.12365642189979553, "signal/brier_reward/group_zero_std_frac": 0.002777777798473835, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04458107054233551, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04458107054233551, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.02871636264026165, "signal/format_reward/group_std_mean": 0.051889467239379886, "signal/format_reward/group_zero_std_frac": 0.7916666865348816, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.014358181320130826, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.014358181320130826, "signal/mean_confidence_reward/centered_abs_mean": 0.056642889976501465, "signal/mean_confidence_reward/group_std_mean": 0.08064212948083878, "signal/mean_confidence_reward/group_zero_std_frac": 0.002777777798473835, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.664289005835599e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.664289005835599e-07, "step": 205 }, { "calibration/aurc": 0.1506776427728051, "calibration/batch_distribution_entropy": 0.4722197449578175, "calibration/confidence_entropy": 0.5859295651237862, "calibration/coverage@0%": 0.018844066545459514, "calibration/coverage@1%": 0.018844066545459514, "calibration/coverage@10%": 0.3256503492679726, "calibration/coverage@15%": 0.4476788772853186, "calibration/coverage@20%": 0.7566187516950194, "calibration/coverage@25%": 0.9565283894389296, "calibration/coverage@30%": 0.9832898172323759, "calibration/coverage@5%": 0.018844066545459514, "calibration/ece": 0.11155852682321177, "calibration/mean_confidence": 0.7071811200298685, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01684027777777779, "completions/max_length": 3595.2, "completions/max_terminated_length": 3595.2, "completions/mean_length": 944.5974243164062, "completions/mean_terminated_length": 960.9596557617188, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.503993700078749, "grad_norm": 0.00048178460565395653, "learning_rate": 4.987980769230769e-06, "loss": -0.0166, "num_tokens": 450423094.0, "reward": 1.2403717279434203, "reward_std": 0.17440281212329864, "rewards/accuracy_reward": 0.69375, "rewards/brier_reward": 0.8039070248603821, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9830729126930237, "rewards/mean_confidence_reward": 0.6742930769920349, "signal/accuracy_reward/centered_abs_mean": 0.15308159738779067, "signal/accuracy_reward/group_std_mean": 0.20622264146804808, "signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07654079869389534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07654079869389534, "signal/advantage_abs_mean": 0.1251235693693161, "signal/advantage_pre_scale_abs_mean": 0.1251235693693161, "signal/advantage_pre_scale_std": 0.2240269511938095, "signal/advantage_std": 0.2240269511938095, "signal/brier_reward/centered_abs_mean": 0.09351787120103836, "signal/brier_reward/group_std_mean": 0.1259806677699089, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04675893560051918, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04675893560051918, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.02883572019636631, "signal/format_reward/group_std_mean": 0.0537541963160038, "signal/format_reward/group_zero_std_frac": 0.7805555582046508, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.014417860098183155, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.014417860098183155, "signal/mean_confidence_reward/centered_abs_mean": 0.06149525865912438, "signal/mean_confidence_reward/group_std_mean": 0.08609340637922287, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.149525916043786e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.149525916043786e-07, "step": 210 }, { "calibration/aurc": 0.1659387279855064, "calibration/batch_distribution_entropy": 0.4373021104798906, "calibration/confidence_entropy": 0.581071101932245, "calibration/coverage@0%": 0.004745196093329096, "calibration/coverage@1%": 0.004745196093329096, "calibration/coverage@10%": 0.5091125438103068, "calibration/coverage@15%": 0.5780162404626946, "calibration/coverage@20%": 0.5988980716253444, "calibration/coverage@25%": 0.8, "calibration/coverage@30%": 0.8, "calibration/coverage@5%": 0.133505090552432, "calibration/ece": 0.14208133406939083, "calibration/mean_confidence": 0.7116783502169329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021267361111111115, "completions/max_length": 3722.2, "completions/max_terminated_length": 3722.2, "completions/mean_length": 1055.9399536132812, "completions/mean_terminated_length": 1079.0013671875, "completions/min_length": 0.0, "completions/min_terminated_length": 359.8, "epoch": 0.515993550080624, "grad_norm": 0.001461154199205339, "learning_rate": 4.957932692307692e-06, "loss": -0.0202, "num_tokens": 465666338.0, "reward": 1.2412683010101317, "reward_std": 0.18140681982040405, "rewards/accuracy_reward": 0.69765625, "rewards/brier_reward": 0.8069155216217041, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9779513716697693, "rewards/mean_confidence_reward": 0.6686267375946044, "signal/accuracy_reward/centered_abs_mean": 0.15249023139476775, "signal/accuracy_reward/group_std_mean": 0.208032289147377, "signal/accuracy_reward/group_zero_std_frac": 0.3833333432674408, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07624511569738388, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07624511569738388, "signal/advantage_abs_mean": 0.12781576663255692, "signal/advantage_pre_scale_abs_mean": 0.12781576663255692, "signal/advantage_pre_scale_std": 0.23086531460285187, "signal/advantage_std": 0.23086531460285187, "signal/brier_reward/centered_abs_mean": 0.09337214976549149, "signal/brier_reward/group_std_mean": 0.12840781062841417, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.046686074882745746, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.046686074882745746, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0362196184694767, "signal/format_reward/group_std_mean": 0.06350894495844842, "signal/format_reward/group_zero_std_frac": 0.7527777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01810980923473835, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01810980923473835, "signal/mean_confidence_reward/centered_abs_mean": 0.06325759962201119, "signal/mean_confidence_reward/group_std_mean": 0.08950979113578797, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.32575950021419e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.32575950021419e-07, "step": 215 }, { "calibration/aurc": 0.16099535865595238, "calibration/batch_distribution_entropy": 0.6229471132590328, "calibration/confidence_entropy": 0.5991021466889653, "calibration/coverage@0%": 0.02480530355189756, "calibration/coverage@1%": 0.02480530355189756, "calibration/coverage@10%": 0.2671506461697198, "calibration/coverage@15%": 0.5760352331496594, "calibration/coverage@20%": 0.661048857127861, "calibration/coverage@25%": 0.8254563697031368, "calibration/coverage@30%": 0.9188579456660207, "calibration/coverage@5%": 0.02480530355189756, "calibration/ece": 0.13982868258235198, "calibration/mean_confidence": 0.6527101583086952, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02430555555555556, "completions/max_length": 3868.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 1099.1880126953124, "completions/mean_terminated_length": 1126.547021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 372.8, "epoch": 0.527993400082499, "grad_norm": 0.0008888631709851325, "learning_rate": 4.927884615384616e-06, "loss": -0.0263, "num_tokens": 481414616.0, "reward": 1.2510359287261963, "reward_std": 0.17950492799282075, "rewards/accuracy_reward": 0.7162326335906982, "rewards/brier_reward": 0.8108263969421386, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.975, "rewards/mean_confidence_reward": 0.6381267547607422, "signal/accuracy_reward/centered_abs_mean": 0.15162217915058135, "signal/accuracy_reward/group_std_mean": 0.20218829214572906, "signal/accuracy_reward/group_zero_std_frac": 0.4138888895511627, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07581108957529067, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07581108957529067, "signal/advantage_abs_mean": 0.12679895013570786, "signal/advantage_pre_scale_abs_mean": 0.12679895013570786, "signal/advantage_pre_scale_std": 0.23299736976623536, "signal/advantage_std": 0.23299736976623536, "signal/brier_reward/centered_abs_mean": 0.09415805339813232, "signal/brier_reward/group_std_mean": 0.12945906221866607, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04707902669906616, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04707902669906616, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.04190538227558136, "signal/format_reward/group_std_mean": 0.07224037051200867, "signal/format_reward/group_zero_std_frac": 0.7277777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.02095269113779068, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.02095269113779068, "signal/mean_confidence_reward/centered_abs_mean": 0.07220215201377869, "signal/mean_confidence_reward/group_std_mean": 0.09774433076381683, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.220214911285439e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.220214911285439e-07, "step": 220 }, { "calibration/aurc": 0.18961739258231108, "calibration/batch_distribution_entropy": 0.6513951806022693, "calibration/confidence_entropy": 0.5939247079780576, "calibration/coverage@0%": 0.007376775040697519, "calibration/coverage@1%": 0.007376775040697519, "calibration/coverage@10%": 0.1693476264848197, "calibration/coverage@15%": 0.38964650594199635, "calibration/coverage@20%": 0.5653257943041693, "calibration/coverage@25%": 0.8109313646377944, "calibration/coverage@30%": 0.8617457034871283, "calibration/coverage@5%": 0.14257597294151264, "calibration/ece": 0.09497116064688986, "calibration/mean_confidence": 0.6355094966373105, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02144097222222221, "completions/max_length": 3731.6, "completions/max_terminated_length": 3731.6, "completions/mean_length": 1088.8068725585938, "completions/mean_terminated_length": 1112.841162109375, "completions/min_length": 0.0, "completions/min_terminated_length": 359.0, "epoch": 0.5399932500843739, "grad_norm": 0.0007127965800464153, "learning_rate": 4.897836538461539e-06, "loss": -0.0187, "num_tokens": 497085863.0, "reward": 1.240068292617798, "reward_std": 0.170024636387825, "rewards/accuracy_reward": 0.69296875, "rewards/brier_reward": 0.8095510244369507, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9776041746139527, "rewards/mean_confidence_reward": 0.6299262166023254, "signal/accuracy_reward/centered_abs_mean": 0.15481227934360503, "signal/accuracy_reward/group_std_mean": 0.20429127514362336, "signal/accuracy_reward/group_zero_std_frac": 0.41666666865348817, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07740613967180252, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07740613967180252, "signal/advantage_abs_mean": 0.12212297022342682, "signal/advantage_pre_scale_abs_mean": 0.12212297022342682, "signal/advantage_pre_scale_std": 0.2249716341495514, "signal/advantage_std": 0.2249716341495514, "signal/brier_reward/centered_abs_mean": 0.09715218842029572, "signal/brier_reward/group_std_mean": 0.12982369661331178, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04857609421014786, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04857609421014786, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.03454861082136631, "signal/format_reward/group_std_mean": 0.058359884470701215, "signal/format_reward/group_zero_std_frac": 0.7805555582046508, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.017274305410683154, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.017274305410683154, "signal/mean_confidence_reward/centered_abs_mean": 0.07715874463319779, "signal/mean_confidence_reward/group_std_mean": 0.10222338140010834, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.715874630775943e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.715874630775943e-07, "step": 225 }, { "calibration/aurc": 0.18557891468016743, "calibration/batch_distribution_entropy": 0.5980522805170089, "calibration/confidence_entropy": 0.5508420204565233, "calibration/coverage@0%": 0.017455123316432057, "calibration/coverage@1%": 0.017455123316432057, "calibration/coverage@10%": 0.15915344893181368, "calibration/coverage@15%": 0.5841507246661936, "calibration/coverage@20%": 0.7673713419934158, "calibration/coverage@25%": 0.8241286863270778, "calibration/coverage@30%": 0.8241286863270778, "calibration/coverage@5%": 0.13362153403819668, "calibration/ece": 0.10804776036088097, "calibration/mean_confidence": 0.7012843236626433, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016927083333333325, "completions/max_length": 3369.6, "completions/max_terminated_length": 3369.6, "completions/mean_length": 935.3328369140625, "completions/mean_terminated_length": 951.2467163085937, "completions/min_length": 0.0, "completions/min_terminated_length": 321.6, "epoch": 0.5519931000862489, "grad_norm": 0.0006603916408494115, "learning_rate": 4.867788461538462e-06, "loss": -0.0127, "num_tokens": 510941665.0, "reward": 1.248283362388611, "reward_std": 0.16216517090797425, "rewards/accuracy_reward": 0.697743046283722, "rewards/brier_reward": 0.8163448214530945, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9824652910232544, "rewards/mean_confidence_reward": 0.6748263955116272, "signal/accuracy_reward/centered_abs_mean": 0.14817708134651184, "signal/accuracy_reward/group_std_mean": 0.19641394019126893, "signal/accuracy_reward/group_zero_std_frac": 0.44166666865348814, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07408854067325592, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07408854067325592, "signal/advantage_abs_mean": 0.11661250591278076, "signal/advantage_pre_scale_abs_mean": 0.11661250591278076, "signal/advantage_pre_scale_std": 0.21369268000125885, "signal/advantage_std": 0.21369268000125885, "signal/brier_reward/centered_abs_mean": 0.09563084691762924, "signal/brier_reward/group_std_mean": 0.1277540296316147, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04781542345881462, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04781542345881462, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02612847201526165, "signal/format_reward/group_std_mean": 0.04846450574696064, "signal/format_reward/group_zero_std_frac": 0.8027777791023254, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.013064236007630825, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.013064236007630825, "signal/mean_confidence_reward/centered_abs_mean": 0.07778537422418594, "signal/mean_confidence_reward/group_std_mean": 0.10394444912672043, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.778537451486045e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.778537451486045e-07, "step": 230 }, { "calibration/aurc": 0.2267518341214403, "calibration/batch_distribution_entropy": 0.7170006877243373, "calibration/confidence_entropy": 0.5349955577960062, "calibration/coverage@0%": 0.01726345266298101, "calibration/coverage@1%": 0.06321645527394706, "calibration/coverage@10%": 0.31590538841007565, "calibration/coverage@15%": 0.41704757554377, "calibration/coverage@20%": 0.47848630787473245, "calibration/coverage@25%": 0.508165058243138, "calibration/coverage@30%": 0.6892811143841154, "calibration/coverage@5%": 0.12865682205581022, "calibration/ece": 0.11967978879843208, "calibration/mean_confidence": 0.6819535404952625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018576388888888885, "completions/max_length": 3613.4, "completions/max_terminated_length": 3613.4, "completions/mean_length": 841.412158203125, "completions/mean_terminated_length": 857.3813232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 264.8, "epoch": 0.5639929500881239, "grad_norm": 0.0005925025325268507, "learning_rate": 4.837740384615385e-06, "loss": -0.019, "num_tokens": 523725325.0, "reward": 1.2389411926269531, "reward_std": 0.18443656861782073, "rewards/accuracy_reward": 0.6816840171813965, "rewards/brier_reward": 0.8149347066879272, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98125, "rewards/mean_confidence_reward": 0.6778428673744201, "signal/accuracy_reward/centered_abs_mean": 0.16112738847732544, "signal/accuracy_reward/group_std_mean": 0.21563574969768523, "signal/accuracy_reward/group_zero_std_frac": 0.37500001192092897, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08056369423866272, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08056369423866272, "signal/advantage_abs_mean": 0.12990931272506714, "signal/advantage_pre_scale_abs_mean": 0.12990931272506714, "signal/advantage_pre_scale_std": 0.2296265959739685, "signal/advantage_std": 0.2296265959739685, "signal/brier_reward/centered_abs_mean": 0.11121913492679596, "signal/brier_reward/group_std_mean": 0.14849260151386262, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.05560956746339798, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.05560956746339798, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.03297526016831398, "signal/format_reward/group_std_mean": 0.06269410699605941, "signal/format_reward/group_zero_std_frac": 0.7472222328186036, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01648763008415699, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01648763008415699, "signal/mean_confidence_reward/centered_abs_mean": 0.09073513895273208, "signal/mean_confidence_reward/group_std_mean": 0.11894857585430145, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 9.073513297153113e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 9.073513297153113e-07, "step": 235 }, { "calibration/aurc": 0.18809772241753855, "calibration/batch_distribution_entropy": 0.6919820653704276, "calibration/confidence_entropy": 0.547615475895375, "calibration/coverage@0%": 0.01933543233422539, "calibration/coverage@1%": 0.01933543233422539, "calibration/coverage@10%": 0.17496092407301372, "calibration/coverage@15%": 0.37692750228430255, "calibration/coverage@20%": 0.6804554431067951, "calibration/coverage@25%": 0.7585443516352972, "calibration/coverage@30%": 0.8845003933910307, "calibration/coverage@5%": 0.029806636522707063, "calibration/ece": 0.09869674887698114, "calibration/mean_confidence": 0.6644477963189794, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03394097222222221, "completions/max_length": 3487.4, "completions/max_terminated_length": 3487.4, "completions/mean_length": 806.119970703125, "completions/mean_terminated_length": 834.1783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 238.4, "epoch": 0.5759928000899989, "grad_norm": 0.000655407493468374, "learning_rate": 4.807692307692308e-06, "loss": -0.0327, "num_tokens": 536135315.0, "reward": 1.2350862979888917, "reward_std": 0.189765664935112, "rewards/accuracy_reward": 0.6988715291023254, "rewards/brier_reward": 0.8052286267280578, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9660590291023254, "rewards/mean_confidence_reward": 0.6683142185211182, "signal/accuracy_reward/centered_abs_mean": 0.14974500834941865, "signal/accuracy_reward/group_std_mean": 0.20264498591423036, "signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07487250417470932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07487250417470932, "signal/advantage_abs_mean": 0.13645783364772796, "signal/advantage_pre_scale_abs_mean": 0.13645783364772796, "signal/advantage_pre_scale_std": 0.24560058116912842, "signal/advantage_std": 0.24560058116912842, "signal/brier_reward/centered_abs_mean": 0.11757578700780869, "signal/brier_reward/group_std_mean": 0.15538268089294432, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.05878789350390434, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.05878789350390434, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0498752161860466, "signal/format_reward/group_std_mean": 0.07907630801200867, "signal/format_reward/group_zero_std_frac": 0.7222222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0249376080930233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0249376080930233, "signal/mean_confidence_reward/centered_abs_mean": 0.09250954985618591, "signal/mean_confidence_reward/group_std_mean": 0.12215169221162796, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 9.250954121853283e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 9.250954121853283e-07, "step": 240 }, { "calibration/aurc": 0.21502708332676299, "calibration/batch_distribution_entropy": 0.6736972632561764, "calibration/confidence_entropy": 0.5294996693262723, "calibration/coverage@0%": 0.009336167919274187, "calibration/coverage@1%": 0.009336167919274187, "calibration/coverage@10%": 0.17180995999768103, "calibration/coverage@15%": 0.34889173380970007, "calibration/coverage@20%": 0.5531000030673423, "calibration/coverage@25%": 0.675086784909768, "calibration/coverage@30%": 0.7896429300409884, "calibration/coverage@5%": 0.009336167919274187, "calibration/ece": 0.13010139567753987, "calibration/mean_confidence": 0.7054667984448812, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03993055555555556, "completions/max_length": 3611.2, "completions/max_terminated_length": 3611.2, "completions/mean_length": 732.729345703125, "completions/mean_terminated_length": 763.1750366210938, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.5879926500918738, "grad_norm": 0.0006918559083715081, "learning_rate": 4.777644230769231e-06, "loss": -0.0404, "num_tokens": 547671429.0, "reward": 1.2193737983703614, "reward_std": 0.2111702412366867, "rewards/accuracy_reward": 0.6796006917953491, "rewards/brier_reward": 0.7990634799003601, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9600694417953491, "rewards/mean_confidence_reward": 0.6946119785308837, "signal/accuracy_reward/centered_abs_mean": 0.1627875417470932, "signal/accuracy_reward/group_std_mean": 0.21149836480617523, "signal/accuracy_reward/group_zero_std_frac": 0.4138889014720917, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0813937708735466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0813937708735466, "signal/advantage_abs_mean": 0.15556576550006868, "signal/advantage_pre_scale_abs_mean": 0.15556576550006868, "signal/advantage_pre_scale_std": 0.26823673248291013, "signal/advantage_std": 0.26823673248291013, "signal/brier_reward/centered_abs_mean": 0.12417666465044022, "signal/brier_reward/group_std_mean": 0.16311552822589875, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.06208833232522011, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.06208833232522011, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.05979817733168602, "signal/format_reward/group_std_mean": 0.09682896584272385, "signal/format_reward/group_zero_std_frac": 0.6500000119209289, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.02989908866584301, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.02989908866584301, "signal/mean_confidence_reward/centered_abs_mean": 0.08894124478101731, "signal/mean_confidence_reward/group_std_mean": 0.12157819718122483, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.894124221114908e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.894124221114908e-07, "step": 245 }, { "calibration/aurc": 0.19849611516365956, "calibration/batch_distribution_entropy": 0.6118099685591544, "calibration/confidence_entropy": 0.518623467281972, "calibration/coverage@0%": 0.008552537922866355, "calibration/coverage@1%": 0.008552537922866355, "calibration/coverage@10%": 0.1878058370618676, "calibration/coverage@15%": 0.4937486617141806, "calibration/coverage@20%": 0.579915168287563, "calibration/coverage@25%": 0.6195504423338132, "calibration/coverage@30%": 0.7882782358691277, "calibration/coverage@5%": 0.008552537922866355, "calibration/ece": 0.10358029221958338, "calibration/mean_confidence": 0.7192545074604272, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03411458333333335, "completions/max_length": 3308.2, "completions/max_terminated_length": 3308.2, "completions/mean_length": 720.272998046875, "completions/mean_terminated_length": 745.5419311523438, "completions/min_length": 0.0, "completions/min_terminated_length": 208.4, "epoch": 0.5999925000937488, "grad_norm": 0.0005598870920948684, "learning_rate": 4.747596153846154e-06, "loss": -0.0338, "num_tokens": 559072974.0, "reward": 1.2204241752624512, "reward_std": 0.19821811318397523, "rewards/accuracy_reward": 0.6730902671813965, "rewards/brier_reward": 0.8018585801124573, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9658854126930236, "rewards/mean_confidence_reward": 0.6962265849113465, "signal/accuracy_reward/centered_abs_mean": 0.14829643964767455, "signal/accuracy_reward/group_std_mean": 0.2052748680114746, "signal/accuracy_reward/group_zero_std_frac": 0.37777777910232546, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07414821982383728, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07414821982383728, "signal/advantage_abs_mean": 0.13989384472370148, "signal/advantage_pre_scale_abs_mean": 0.13989384472370148, "signal/advantage_pre_scale_std": 0.24792618155479432, "signal/advantage_std": 0.24792618155479432, "signal/brier_reward/centered_abs_mean": 0.11858828216791154, "signal/brier_reward/group_std_mean": 0.1588190108537674, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.05929414108395577, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.05929414108395577, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.04991862066090107, "signal/format_reward/group_std_mean": 0.08470256775617599, "signal/format_reward/group_zero_std_frac": 0.6805555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.024959310330450533, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.024959310330450533, "signal/mean_confidence_reward/centered_abs_mean": 0.08715028315782547, "signal/mean_confidence_reward/group_std_mean": 0.11766181737184525, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.715028343431186e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.715028343431186e-07, "step": 250 }, { "epoch": 0.5999925000937488, "eval_calibration/aurc": 0.14847039058334097, "eval_calibration/batch_distribution_entropy": 0.5532074997412568, "eval_calibration/confidence_entropy": 0.5117870126264737, "eval_calibration/coverage@0%": 0.0991263440860215, "eval_calibration/coverage@1%": 0.0991263440860215, "eval_calibration/coverage@10%": 0.46875, "eval_calibration/coverage@15%": 0.5520833333333334, "eval_calibration/coverage@20%": 0.7291666666666666, "eval_calibration/coverage@25%": 0.765625, "eval_calibration/coverage@30%": 0.8333333333333334, "eval_calibration/coverage@5%": 0.24428763440860216, "eval_calibration/ece": 0.16076108870967745, "eval_calibration/mean_confidence": 0.730015120967742, "eval_completions/clipped_ratio": 0.018229166666666668, "eval_completions/max_length": 2105.3333333333335, "eval_completions/max_terminated_length": 2105.3333333333335, "eval_completions/mean_length": 715.6415100097656, "eval_completions/mean_terminated_length": 729.0237528483073, "eval_completions/min_length": 51.0, "eval_completions/min_terminated_length": 260.6666666666667, "eval_loss": 0.0, "eval_num_tokens": 559072974.0, "eval_reward": 1.229289968808492, "eval_reward_std": 0.35854795078436535, "eval_rewards/accuracy_reward": 0.676215281089147, "eval_rewards/brier_reward": 0.8040521244208018, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9782986144224802, "eval_rewards/mean_confidence_reward": 0.6897673606872559, "eval_runtime": 192.6668, "eval_samples_per_second": 5.19, "eval_signal/accuracy_reward/centered_abs_mean": 0.4197591145833333, "eval_signal/accuracy_reward/group_std_mean": 0.4634587913751602, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.20987955729166666, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.20987955729166666, "eval_signal/advantage_abs_mean": 0.302593136827151, "eval_signal/advantage_pre_scale_abs_mean": 0.302593136827151, "eval_signal/advantage_pre_scale_std": 0.35729822516441345, "eval_signal/advantage_std": 0.35729822516441345, "eval_signal/brier_reward/centered_abs_mean": 0.19831266005833945, "eval_signal/brier_reward/group_std_mean": 0.24753761291503906, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09915633002916972, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.09915633002916972, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.041286892568071686, "eval_signal/format_reward/group_std_mean": 0.10429880519707997, "eval_signal/format_reward/group_zero_std_frac": 0.4722222238779068, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.020643446284035843, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.020643446284035843, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.16337869564692178, "eval_signal/mean_confidence_reward/group_std_mean": 0.21260220805803934, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.6337868942173372e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.6337868942173372e-06, "eval_steps_per_second": 0.031, "step": 250 }, { "epoch": 0.5999925000937488, "step": 250, "train_probe_calibration/aurc": 0.19034182445887202, "train_probe_calibration/batch_distribution_entropy": 0.6060689311243653, "train_probe_calibration/confidence_entropy": 0.5227177257355492, "train_probe_calibration/coverage@0%": 0.12701612903225806, "train_probe_calibration/coverage@1%": 0.12701612903225806, "train_probe_calibration/coverage@10%": 0.23303091397849462, "train_probe_calibration/coverage@15%": 0.2760416666666667, "train_probe_calibration/coverage@20%": 0.5282258064516129, "train_probe_calibration/coverage@25%": 0.747143817204301, "train_probe_calibration/coverage@30%": 0.8953293010752689, "train_probe_calibration/coverage@5%": 0.12701612903225806, "train_probe_calibration/ece": 0.11350806451612905, "train_probe_calibration/mean_confidence": 0.7103494623655915, "train_probe_completions/clipped_ratio": 0.016493055555555542, "train_probe_completions/max_length": 2470.1666666666665, "train_probe_completions/max_terminated_length": 2470.1666666666665, "train_probe_completions/mean_length": 718.718760172526, "train_probe_completions/mean_terminated_length": 730.877939860026, "train_probe_completions/min_length": 38.666666666666664, "train_probe_completions/min_terminated_length": 231.33333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 559072974.0, "train_probe_reward": 1.2542650898297627, "train_probe_reward_std": 0.3470543523629506, "train_probe_rewards/accuracy_reward": 0.7118055522441864, "train_probe_rewards/brier_reward": 0.8166758815447489, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.980034718910853, "train_probe_rewards/mean_confidence_reward": 0.6982812583446503, "train_probe_runtime": 209.892, "train_probe_samples_per_second": 4.764, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4011501719554265, "train_probe_signal/accuracy_reward/group_std_mean": 0.4534666786591212, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.20057508597771326, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.20057508597771326, "train_probe_signal/advantage_abs_mean": 0.28758982320626575, "train_probe_signal/advantage_pre_scale_abs_mean": 0.28758982320626575, "train_probe_signal/advantage_pre_scale_std": 0.3458290745814641, "train_probe_signal/advantage_std": 0.3458290745814641, "train_probe_signal/brier_reward/centered_abs_mean": 0.18627643833557764, "train_probe_signal/brier_reward/group_std_mean": 0.23727227499087652, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09313821916778882, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.09313821916778882, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.037923176462451615, "train_probe_signal/format_reward/group_std_mean": 0.09964303423961003, "train_probe_signal/format_reward/group_zero_std_frac": 0.4722222338120143, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.018961588231225807, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.018961588231225807, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.1545372207959493, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.20506344735622406, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.5453721857738856e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.5453721857738856e-06, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.15655267494577998, "calibration/batch_distribution_entropy": 0.6326617500644323, "calibration/confidence_entropy": 0.5328678825386778, "calibration/coverage@0%": 0.006850682370916775, "calibration/coverage@1%": 0.006850682370916775, "calibration/coverage@10%": 0.45753152807118624, "calibration/coverage@15%": 0.5668117704799095, "calibration/coverage@20%": 0.6490574452679432, "calibration/coverage@25%": 0.693612281672555, "calibration/coverage@30%": 0.9135630403923087, "calibration/coverage@5%": 0.27735034666707986, "calibration/ece": 0.1502316103337395, "calibration/mean_confidence": 0.6769871838283059, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01180555555555558, "completions/max_length": 2908.8, "completions/max_terminated_length": 2908.8, "completions/mean_length": 735.1975708007812, "completions/mean_terminated_length": 744.0412963867187, "completions/min_length": 0.0, "completions/min_terminated_length": 212.8, "epoch": 0.6119923500956238, "grad_norm": 0.0005592016386799514, "learning_rate": 4.7175480769230775e-06, "loss": -0.013, "num_tokens": 570639858.0, "reward": 1.2332837581634521, "reward_std": 0.1590871512889862, "rewards/accuracy_reward": 0.6743923664093018, "rewards/brier_reward": 0.8039672613143921, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9881944417953491, "rewards/mean_confidence_reward": 0.6666790962219238, "signal/accuracy_reward/centered_abs_mean": 0.1418674036860466, "signal/accuracy_reward/group_std_mean": 0.1933798998594284, "signal/accuracy_reward/group_zero_std_frac": 0.43055555820465086, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0709337018430233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0709337018430233, "signal/advantage_abs_mean": 0.11313628554344177, "signal/advantage_pre_scale_abs_mean": 0.11313628554344177, "signal/advantage_pre_scale_std": 0.20162217020988465, "signal/advantage_std": 0.20162217020988465, "signal/brier_reward/centered_abs_mean": 0.1082306370139122, "signal/brier_reward/group_std_mean": 0.14206129610538482, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0541153185069561, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0541153185069561, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.021256510727107526, "signal/format_reward/group_std_mean": 0.04359033033251762, "signal/format_reward/group_zero_std_frac": 0.8111111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.010628255363553763, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010628255363553763, "signal/mean_confidence_reward/centered_abs_mean": 0.08874420076608658, "signal/mean_confidence_reward/group_std_mean": 0.11713929325342179, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.874420132087835e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.874420132087835e-07, "step": 255 }, { "calibration/aurc": 0.14555074127111917, "calibration/batch_distribution_entropy": 0.6792731203085787, "calibration/confidence_entropy": 0.5445991816328819, "calibration/coverage@0%": 0.002657289648782953, "calibration/coverage@1%": 0.002657289648782953, "calibration/coverage@10%": 0.3722143782175488, "calibration/coverage@15%": 0.5508895903219658, "calibration/coverage@20%": 0.7799107393525928, "calibration/coverage@25%": 0.8539044523288133, "calibration/coverage@30%": 0.9365833333333334, "calibration/coverage@5%": 0.1902065772166155, "calibration/ece": 0.08235966401775505, "calibration/mean_confidence": 0.6525603283285002, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015885416666666673, "completions/max_length": 3592.6, "completions/max_terminated_length": 3592.6, "completions/mean_length": 776.5939331054688, "completions/mean_terminated_length": 789.0546142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 257.2, "epoch": 0.6239922000974988, "grad_norm": 0.000750742678064853, "learning_rate": 4.6875000000000004e-06, "loss": -0.0151, "num_tokens": 582685452.0, "reward": 1.2590891599655152, "reward_std": 0.15570541024208068, "rewards/accuracy_reward": 0.7094618082046509, "rewards/brier_reward": 0.8246756792068481, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9840277791023254, "rewards/mean_confidence_reward": 0.6495569825172425, "signal/accuracy_reward/centered_abs_mean": 0.14443901926279068, "signal/accuracy_reward/group_std_mean": 0.19196665585041045, "signal/accuracy_reward/group_zero_std_frac": 0.45277778506278993, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07221950963139534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07221950963139534, "signal/advantage_abs_mean": 0.1109919860959053, "signal/advantage_pre_scale_abs_mean": 0.1109919860959053, "signal/advantage_pre_scale_std": 0.2003971070051193, "signal/advantage_std": 0.2003971070051193, "signal/brier_reward/centered_abs_mean": 0.1043332725763321, "signal/brier_reward/group_std_mean": 0.13740936368703843, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.05216663628816605, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.05216663628816605, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.023828125, "signal/format_reward/group_std_mean": 0.042067190259695054, "signal/format_reward/group_zero_std_frac": 0.8361111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0119140625, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0119140625, "signal/mean_confidence_reward/centered_abs_mean": 0.08473697453737258, "signal/mean_confidence_reward/group_std_mean": 0.11159718781709671, "signal/mean_confidence_reward/group_zero_std_frac": 0.0, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.473697562294547e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.473697562294547e-07, "step": 260 }, { "calibration/aurc": 0.14333929467424694, "calibration/batch_distribution_entropy": 0.6432563988778293, "calibration/confidence_entropy": 0.512806401090686, "calibration/coverage@0%": 0.019976559375041704, "calibration/coverage@1%": 0.019976559375041704, "calibration/coverage@10%": 0.38973409191653713, "calibration/coverage@15%": 0.6500249230217667, "calibration/coverage@20%": 0.727064702833542, "calibration/coverage@25%": 0.7611309389198413, "calibration/coverage@30%": 0.9327572869180827, "calibration/coverage@5%": 0.22078882626273338, "calibration/ece": 0.12300491006973044, "calibration/mean_confidence": 0.7169319026761409, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01284722222222221, "completions/max_length": 3342.2, "completions/max_terminated_length": 3342.2, "completions/mean_length": 790.7150268554688, "completions/mean_terminated_length": 801.0620971679688, "completions/min_length": 0.0, "completions/min_terminated_length": 228.2, "epoch": 0.6359920500993738, "grad_norm": 0.0008038398809731007, "learning_rate": 4.657451923076923e-06, "loss": -0.014, "num_tokens": 594864313.0, "reward": 1.2632108926773071, "reward_std": 0.15936911702156067, "rewards/accuracy_reward": 0.7068576335906982, "rewards/brier_reward": 0.8327441096305848, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9868055582046509, "rewards/mean_confidence_reward": 0.7177876234054565, "signal/accuracy_reward/centered_abs_mean": 0.134228515625, "signal/accuracy_reward/group_std_mean": 0.18389686346054077, "signal/accuracy_reward/group_zero_std_frac": 0.4527777910232544, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0671142578125, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0671142578125, "signal/advantage_abs_mean": 0.11144563108682633, "signal/advantage_pre_scale_abs_mean": 0.11144563108682633, "signal/advantage_pre_scale_std": 0.20921348929405212, "signal/advantage_std": 0.20921348929405212, "signal/brier_reward/centered_abs_mean": 0.09292279034852982, "signal/brier_reward/group_std_mean": 0.12824206948280334, "signal/brier_reward/group_zero_std_frac": 0.01388888917863369, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04646139517426491, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04646139517426491, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02241753451526165, "signal/format_reward/group_std_mean": 0.0418248750269413, "signal/format_reward/group_zero_std_frac": 0.830555546283722, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.011208767257630824, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.011208767257630824, "signal/mean_confidence_reward/centered_abs_mean": 0.06767013072967529, "signal/mean_confidence_reward/group_std_mean": 0.09113913923501968, "signal/mean_confidence_reward/group_zero_std_frac": 0.01388888917863369, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.767012905584124e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.767012905584124e-07, "step": 265 }, { "calibration/aurc": 0.18302620801123665, "calibration/batch_distribution_entropy": 0.6001924638632085, "calibration/confidence_entropy": 0.5022208833165465, "calibration/coverage@0%": 0.01537470208215087, "calibration/coverage@1%": 0.01537470208215087, "calibration/coverage@10%": 0.3792761324327648, "calibration/coverage@15%": 0.4100855319105715, "calibration/coverage@20%": 0.505211827043216, "calibration/coverage@25%": 0.6328690052882674, "calibration/coverage@30%": 0.9273884857879244, "calibration/coverage@5%": 0.01537470208215087, "calibration/ece": 0.1286059248948004, "calibration/mean_confidence": 0.7432788017381937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009635416666666674, "completions/max_length": 3425.0, "completions/max_terminated_length": 3425.0, "completions/mean_length": 855.0973999023438, "completions/mean_terminated_length": 863.3589111328125, "completions/min_length": 0.0, "completions/min_terminated_length": 261.4, "epoch": 0.6479919001012487, "grad_norm": 0.0007353985565714538, "learning_rate": 4.627403846153847e-06, "loss": -0.0074, "num_tokens": 607835067.0, "reward": 1.26796498298645, "reward_std": 0.1529661923646927, "rewards/accuracy_reward": 0.7158854126930236, "rewards/brier_reward": 0.8299252033233643, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9901041626930237, "rewards/mean_confidence_reward": 0.752193284034729, "signal/accuracy_reward/centered_abs_mean": 0.14311523288488387, "signal/accuracy_reward/group_std_mean": 0.18656840920448303, "signal/accuracy_reward/group_zero_std_frac": 0.4833333432674408, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07155761644244193, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07155761644244193, "signal/advantage_abs_mean": 0.11355446428060531, "signal/advantage_pre_scale_abs_mean": 0.11355446428060531, "signal/advantage_pre_scale_std": 0.21068328022956848, "signal/advantage_std": 0.21068328022956848, "signal/brier_reward/centered_abs_mean": 0.08958265632390976, "signal/brier_reward/group_std_mean": 0.11962602138519288, "signal/brier_reward/group_zero_std_frac": 0.036111111752688885, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04479132816195488, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04479132816195488, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.016395399160683156, "signal/format_reward/group_std_mean": 0.032642266154289244, "signal/format_reward/group_zero_std_frac": 0.8583333373069764, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.008197699580341578, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.008197699580341578, "signal/mean_confidence_reward/centered_abs_mean": 0.05924979448318481, "signal/mean_confidence_reward/group_std_mean": 0.07978306859731674, "signal/mean_confidence_reward/group_zero_std_frac": 0.03888888955116272, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.924979177507339e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.924979177507339e-07, "step": 270 }, { "calibration/aurc": 0.24421176181442766, "calibration/batch_distribution_entropy": 0.566733262797074, "calibration/confidence_entropy": 0.48159664211224334, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.24511259791122714, "calibration/coverage@15%": 0.3621729653652954, "calibration/coverage@20%": 0.4201364118404912, "calibration/coverage@25%": 0.610098602656528, "calibration/coverage@30%": 0.7620052770448549, "calibration/coverage@5%": 0.12271540469973889, "calibration/ece": 0.17193173282359991, "calibration/mean_confidence": 0.784665477266052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006510416666666674, "completions/max_length": 3472.2, "completions/max_terminated_length": 3472.2, "completions/mean_length": 989.216748046875, "completions/mean_terminated_length": 995.6683349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 279.4, "epoch": 0.6599917501031237, "grad_norm": 0.000928328197915107, "learning_rate": 4.597355769230769e-06, "loss": -0.0051, "num_tokens": 622337756.0, "reward": 1.2550419092178344, "reward_std": 0.1517462909221649, "rewards/accuracy_reward": 0.6978298425674438, "rewards/brier_reward": 0.8190093636512756, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9932291746139527, "rewards/mean_confidence_reward": 0.7646481156349182, "signal/accuracy_reward/centered_abs_mean": 0.13515082448720933, "signal/accuracy_reward/group_std_mean": 0.1835828095674515, "signal/accuracy_reward/group_zero_std_frac": 0.45833333730697634, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06757541224360467, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06757541224360467, "signal/advantage_abs_mean": 0.10827735662460328, "signal/advantage_pre_scale_abs_mean": 0.10827735662460328, "signal/advantage_pre_scale_std": 0.20328060388565064, "signal/advantage_std": 0.20328060388565064, "signal/brier_reward/centered_abs_mean": 0.0844566598534584, "signal/brier_reward/group_std_mean": 0.11667095571756363, "signal/brier_reward/group_zero_std_frac": 0.002777777798473835, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0422283299267292, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0422283299267292, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.012174479104578495, "signal/format_reward/group_std_mean": 0.025952914357185365, "signal/format_reward/group_zero_std_frac": 0.8805555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006087239552289248, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006087239552289248, "signal/mean_confidence_reward/centered_abs_mean": 0.06158406212925911, "signal/mean_confidence_reward/group_std_mean": 0.08380927592515945, "signal/mean_confidence_reward/group_zero_std_frac": 0.00555555559694767, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.158405994938221e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.158405994938221e-07, "step": 275 }, { "calibration/aurc": 0.12399220936659239, "calibration/batch_distribution_entropy": 0.5716606692330453, "calibration/confidence_entropy": 0.4964877827596392, "calibration/coverage@0%": 0.038976604513335525, "calibration/coverage@1%": 0.10720577118000219, "calibration/coverage@10%": 0.4791847199444669, "calibration/coverage@15%": 0.6490695883655195, "calibration/coverage@20%": 0.7510690314175941, "calibration/coverage@25%": 0.8545381997944143, "calibration/coverage@30%": 0.8719636161644215, "calibration/coverage@5%": 0.47345555327780015, "calibration/ece": 0.13711991451506045, "calibration/mean_confidence": 0.7571594208983778, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010763888888888884, "completions/max_length": 3533.6, "completions/max_terminated_length": 3533.6, "completions/mean_length": 1014.65546875, "completions/mean_terminated_length": 1025.66494140625, "completions/min_length": 0.0, "completions/min_terminated_length": 258.6, "epoch": 0.6719916001049987, "grad_norm": 0.0007735313265584409, "learning_rate": 4.567307692307692e-06, "loss": -0.0112, "num_tokens": 637121883.0, "reward": 1.2405794620513917, "reward_std": 0.17485959231853485, "rewards/accuracy_reward": 0.6844617843627929, "rewards/brier_reward": 0.8080535769462586, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9886284589767456, "rewards/mean_confidence_reward": 0.7456082701683044, "signal/accuracy_reward/centered_abs_mean": 0.1605197474360466, "signal/accuracy_reward/group_std_mean": 0.21143212616443635, "signal/accuracy_reward/group_zero_std_frac": 0.4, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0802598737180233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0802598737180233, "signal/advantage_abs_mean": 0.12838667780160903, "signal/advantage_pre_scale_abs_mean": 0.12838667780160903, "signal/advantage_pre_scale_std": 0.22537092566490174, "signal/advantage_std": 0.22537092566490174, "signal/brier_reward/centered_abs_mean": 0.09329253137111664, "signal/brier_reward/group_std_mean": 0.12645308077335357, "signal/brier_reward/group_zero_std_frac": 0.022222222574055196, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04664626568555832, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04664626568555832, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.01971028670668602, "signal/format_reward/group_std_mean": 0.03896139487624169, "signal/format_reward/group_zero_std_frac": 0.8333333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00985514335334301, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00985514335334301, "signal/mean_confidence_reward/centered_abs_mean": 0.058378614485263824, "signal/mean_confidence_reward/group_std_mean": 0.07870236337184906, "signal/mean_confidence_reward/group_zero_std_frac": 0.03333333395421505, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.83786118113494e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.83786118113494e-07, "step": 280 }, { "calibration/aurc": 0.1380177601536657, "calibration/batch_distribution_entropy": 0.5624959589519792, "calibration/confidence_entropy": 0.5319099336487814, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.1253475407522405, "calibration/coverage@10%": 0.41286006633265115, "calibration/coverage@15%": 0.5369404845499525, "calibration/coverage@20%": 0.7469524600854679, "calibration/coverage@25%": 0.8193758584141587, "calibration/coverage@30%": 0.9526031188916949, "calibration/coverage@5%": 0.20210429750899728, "calibration/ece": 0.12902052698644187, "calibration/mean_confidence": 0.739752318715216, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02621527777777779, "completions/max_length": 3990.0, "completions/max_terminated_length": 3990.0, "completions/mean_length": 980.7057495117188, "completions/mean_terminated_length": 1007.0260131835937, "completions/min_length": 0.0, "completions/min_terminated_length": 227.4, "epoch": 0.6839914501068737, "grad_norm": 0.0008436363423243165, "learning_rate": 4.537259615384616e-06, "loss": -0.0279, "num_tokens": 651522685.0, "reward": 1.2463345050811767, "reward_std": 0.18331459462642669, "rewards/accuracy_reward": 0.7010416746139526, "rewards/brier_reward": 0.8178282022476197, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9737847208976745, "rewards/mean_confidence_reward": 0.7123567461967468, "signal/accuracy_reward/centered_abs_mean": 0.15362413227558136, "signal/accuracy_reward/group_std_mean": 0.20662194788455962, "signal/accuracy_reward/group_zero_std_frac": 0.39166666865348815, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07681206613779068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07681206613779068, "signal/advantage_abs_mean": 0.13278607577085494, "signal/advantage_pre_scale_abs_mean": 0.13278607577085494, "signal/advantage_pre_scale_std": 0.24074990451335906, "signal/advantage_std": 0.24074990451335906, "signal/brier_reward/centered_abs_mean": 0.0947231262922287, "signal/brier_reward/group_std_mean": 0.12886887341737746, "signal/brier_reward/group_zero_std_frac": 0.08333333395421505, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04736156314611435, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04736156314611435, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.03945312537252903, "signal/format_reward/group_std_mean": 0.06299598962068557, "signal/format_reward/group_zero_std_frac": 0.7722222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.019726562686264514, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.019726562686264514, "signal/mean_confidence_reward/centered_abs_mean": 0.0638624295592308, "signal/mean_confidence_reward/group_std_mean": 0.08576888144016266, "signal/mean_confidence_reward/group_zero_std_frac": 0.09166666679084301, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.386242489497817e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.386242489497817e-07, "step": 285 }, { "calibration/aurc": 0.17701762908847254, "calibration/batch_distribution_entropy": 0.5941155796417726, "calibration/confidence_entropy": 0.5110317332881141, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.4559482529942563, "calibration/coverage@20%": 0.6397517961216754, "calibration/coverage@25%": 0.8327237195529038, "calibration/coverage@30%": 0.9609012474550118, "calibration/coverage@5%": 0.0, "calibration/ece": 0.09082641381174666, "calibration/mean_confidence": 0.7427203120802487, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028038194444444442, "completions/max_length": 3895.8, "completions/max_terminated_length": 3895.8, "completions/mean_length": 978.0315185546875, "completions/mean_terminated_length": 1006.3482788085937, "completions/min_length": 0.0, "completions/min_terminated_length": 263.6, "epoch": 0.6959913001087487, "grad_norm": 0.000650497677270323, "learning_rate": 4.507211538461539e-06, "loss": -0.0293, "num_tokens": 665885512.0, "reward": 1.2346878528594971, "reward_std": 0.184012308716774, "rewards/accuracy_reward": 0.6897569537162781, "rewards/brier_reward": 0.8076424598693848, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9719618082046508, "rewards/mean_confidence_reward": 0.7166210293769837, "signal/accuracy_reward/centered_abs_mean": 0.1428493946790695, "signal/accuracy_reward/group_std_mean": 0.19786538183689117, "signal/accuracy_reward/group_zero_std_frac": 0.397222226858139, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07142469733953476, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07142469733953476, "signal/advantage_abs_mean": 0.12772089540958403, "signal/advantage_pre_scale_abs_mean": 0.12772089540958403, "signal/advantage_pre_scale_std": 0.24163770973682402, "signal/advantage_std": 0.24163770973682402, "signal/brier_reward/centered_abs_mean": 0.09354732930660248, "signal/brier_reward/group_std_mean": 0.1311555951833725, "signal/brier_reward/group_zero_std_frac": 0.15277777910232543, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04677366465330124, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04677366465330124, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.04412434920668602, "signal/format_reward/group_std_mean": 0.0740816980600357, "signal/format_reward/group_zero_std_frac": 0.7277777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.02206217460334301, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.02206217460334301, "signal/mean_confidence_reward/centered_abs_mean": 0.06570715606212615, "signal/mean_confidence_reward/group_std_mean": 0.09067816883325577, "signal/mean_confidence_reward/group_zero_std_frac": 0.17777777910232545, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.570715640918933e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.570715640918933e-07, "step": 290 }, { "calibration/aurc": 0.1475927794999495, "calibration/batch_distribution_entropy": 0.6050167835449746, "calibration/confidence_entropy": 0.5120933668759969, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.39416282642089095, "calibration/coverage@15%": 0.47200887523468166, "calibration/coverage@20%": 0.8427231895750096, "calibration/coverage@25%": 0.9335403369558968, "calibration/coverage@30%": 0.9684031970559485, "calibration/coverage@5%": 0.15714285714285714, "calibration/ece": 0.10218502486552054, "calibration/mean_confidence": 0.7273700707649587, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 3671.6, "completions/max_terminated_length": 3671.6, "completions/mean_length": 1014.1304809570313, "completions/mean_terminated_length": 1043.532421875, "completions/min_length": 0.0, "completions/min_terminated_length": 284.6, "epoch": 0.7079911501106236, "grad_norm": 0.0007778993458487093, "learning_rate": 4.477163461538462e-06, "loss": -0.0315, "num_tokens": 680653767.0, "reward": 1.2349740505218505, "reward_std": 0.18841009140014647, "rewards/accuracy_reward": 0.6837673544883728, "rewards/brier_reward": 0.8142914772033691, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.971875, "rewards/mean_confidence_reward": 0.7128588438034058, "signal/accuracy_reward/centered_abs_mean": 0.15353190004825593, "signal/accuracy_reward/group_std_mean": 0.20423202812671662, "signal/accuracy_reward/group_zero_std_frac": 0.4138888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07676595002412796, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07676595002412796, "signal/advantage_abs_mean": 0.13263954520225524, "signal/advantage_pre_scale_abs_mean": 0.13263954520225524, "signal/advantage_pre_scale_std": 0.2484306275844574, "signal/advantage_std": 0.2484306275844574, "signal/brier_reward/centered_abs_mean": 0.09720402956008911, "signal/brier_reward/group_std_mean": 0.13595170974731446, "signal/brier_reward/group_zero_std_frac": 0.2305555522441864, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.048602014780044556, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.048602014780044556, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.04541015625, "signal/format_reward/group_std_mean": 0.07853747233748436, "signal/format_reward/group_zero_std_frac": 0.7000000238418579, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.022705078125, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.022705078125, "signal/mean_confidence_reward/centered_abs_mean": 0.06733023822307586, "signal/mean_confidence_reward/group_std_mean": 0.09114226698875427, "signal/mean_confidence_reward/group_zero_std_frac": 0.2611111134290695, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.733023724336817e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.733023724336817e-07, "step": 295 }, { "calibration/aurc": 0.12745961109740883, "calibration/batch_distribution_entropy": 0.4489048960282684, "calibration/confidence_entropy": 0.48372576902773, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.6028079533138163, "calibration/coverage@15%": 0.6349236044229604, "calibration/coverage@20%": 0.8451893239822802, "calibration/coverage@25%": 0.9473373026288524, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.13527851458885942, "calibration/ece": 0.11861006943810426, "calibration/mean_confidence": 0.7870798179227735, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01605902777777779, "completions/max_length": 3831.6, "completions/max_terminated_length": 3831.6, "completions/mean_length": 981.0454223632812, "completions/mean_terminated_length": 997.0089233398437, "completions/min_length": 0.0, "completions/min_terminated_length": 235.8, "epoch": 0.7199910001124986, "grad_norm": 0.0006414313684217632, "learning_rate": 4.447115384615385e-06, "loss": -0.0171, "num_tokens": 695056722.0, "reward": 1.2570258855819703, "reward_std": 0.1558358281850815, "rewards/accuracy_reward": 0.7036458373069763, "rewards/brier_reward": 0.826449990272522, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9839409708976745, "rewards/mean_confidence_reward": 0.7460254788398742, "signal/accuracy_reward/centered_abs_mean": 0.12991536408662796, "signal/accuracy_reward/group_std_mean": 0.17473788559436798, "signal/accuracy_reward/group_zero_std_frac": 0.4888889014720917, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06495768204331398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06495768204331398, "signal/advantage_abs_mean": 0.1105946034193039, "signal/advantage_pre_scale_abs_mean": 0.1105946034193039, "signal/advantage_pre_scale_std": 0.21774430871009826, "signal/advantage_std": 0.21774430871009826, "signal/brier_reward/centered_abs_mean": 0.08356151729822159, "signal/brier_reward/group_std_mean": 0.1157326340675354, "signal/brier_reward/group_zero_std_frac": 0.24166666865348815, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.041780758649110794, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.041780758649110794, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02743598110973835, "signal/format_reward/group_std_mean": 0.04962395206093788, "signal/format_reward/group_zero_std_frac": 0.805555546283722, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.013717990554869174, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.013717990554869174, "signal/mean_confidence_reward/centered_abs_mean": 0.05799918100237846, "signal/mean_confidence_reward/group_std_mean": 0.07967415452003479, "signal/mean_confidence_reward/group_zero_std_frac": 0.2722222238779068, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.799918085358513e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.799918085358513e-07, "step": 300 }, { "epoch": 0.7199910001124986, "eval_calibration/aurc": 0.11797161462341547, "eval_calibration/batch_distribution_entropy": 0.5668926430185336, "eval_calibration/confidence_entropy": 0.5112455391776612, "eval_calibration/coverage@0%": 0.17673611111111112, "eval_calibration/coverage@1%": 0.17673611111111112, "eval_calibration/coverage@10%": 0.4760416666666667, "eval_calibration/coverage@15%": 0.6577284946236559, "eval_calibration/coverage@20%": 0.8069556451612904, "eval_calibration/coverage@25%": 0.953125, "eval_calibration/coverage@30%": 0.9739583333333334, "eval_calibration/coverage@5%": 0.35625, "eval_calibration/ece": 0.14525089605734767, "eval_calibration/mean_confidence": 0.7430230734767026, "eval_completions/clipped_ratio": 0.029166666666666674, "eval_completions/max_length": 2800.3333333333335, "eval_completions/max_terminated_length": 2800.3333333333335, "eval_completions/mean_length": 911.1606038411459, "eval_completions/mean_terminated_length": 938.0648295084635, "eval_completions/min_length": 0.0, "eval_completions/min_terminated_length": 275.3333333333333, "eval_loss": 0.0, "eval_num_tokens": 695056722.0, "eval_reward": 1.242617944876353, "eval_reward_std": 0.3668680141369502, "eval_rewards/accuracy_reward": 0.6935763855775198, "eval_rewards/brier_reward": 0.8168183863162994, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9748263855775198, "eval_rewards/mean_confidence_reward": 0.7271353900432587, "eval_runtime": 212.5164, "eval_samples_per_second": 4.706, "eval_signal/accuracy_reward/centered_abs_mean": 0.4080403596162796, "eval_signal/accuracy_reward/group_std_mean": 0.4572909673055013, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.2040201798081398, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2040201798081398, "eval_signal/advantage_abs_mean": 0.30681467056274414, "eval_signal/advantage_pre_scale_abs_mean": 0.30681467056274414, "eval_signal/advantage_pre_scale_std": 0.36492103338241577, "eval_signal/advantage_std": 0.36492103338241577, "eval_signal/brier_reward/centered_abs_mean": 0.18895844121774039, "eval_signal/brier_reward/group_std_mean": 0.242453766365846, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09447922060887019, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.09447922060887019, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.048122829757630825, "eval_signal/format_reward/group_std_mean": 0.12446699477732182, "eval_signal/format_reward/group_zero_std_frac": 0.361111119389534, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.024061414878815413, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.024061414878815413, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.13625056172410646, "eval_signal/mean_confidence_reward/group_std_mean": 0.18512510259946188, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.3625055809522262e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.3625055809522262e-06, "eval_steps_per_second": 0.028, "step": 300 }, { "epoch": 0.7199910001124986, "step": 300, "train_probe_calibration/aurc": 0.18947713575627176, "train_probe_calibration/batch_distribution_entropy": 0.511105087444719, "train_probe_calibration/confidence_entropy": 0.5016968812435342, "train_probe_calibration/coverage@0%": 0.0, "train_probe_calibration/coverage@1%": 0.0, "train_probe_calibration/coverage@10%": 0.13541666666666666, "train_probe_calibration/coverage@15%": 0.2604166666666667, "train_probe_calibration/coverage@20%": 0.6697657891484367, "train_probe_calibration/coverage@25%": 0.7356182795698926, "train_probe_calibration/coverage@30%": 0.9342069892473118, "train_probe_calibration/coverage@5%": 0.109375, "train_probe_calibration/ece": 0.13607638888888887, "train_probe_calibration/mean_confidence": 0.7676371122234583, "train_probe_completions/clipped_ratio": 0.030034722222222237, "train_probe_completions/max_length": 3148.0, "train_probe_completions/max_terminated_length": 3148.0, "train_probe_completions/mean_length": 924.5604349772135, "train_probe_completions/mean_terminated_length": 952.9794311523438, "train_probe_completions/min_length": 0.0, "train_probe_completions/min_terminated_length": 234.16666666666666, "train_probe_loss": 0.0, "train_probe_num_tokens": 695056722.0, "train_probe_reward": 1.259952962398529, "train_probe_reward_std": 0.3604995807011922, "train_probe_rewards/accuracy_reward": 0.7161458333333334, "train_probe_rewards/brier_reward": 0.8280508915583292, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9756944477558136, "train_probe_rewards/mean_confidence_reward": 0.7298292617003123, "train_probe_runtime": 215.2679, "train_probe_samples_per_second": 4.645, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3979492137829463, "train_probe_signal/accuracy_reward/group_std_mean": 0.45210810999075574, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19897460689147314, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.19897460689147314, "train_probe_signal/advantage_abs_mean": 0.29868050416310626, "train_probe_signal/advantage_pre_scale_abs_mean": 0.29868050416310626, "train_probe_signal/advantage_pre_scale_std": 0.3591413696606954, "train_probe_signal/advantage_std": 0.3591413696606954, "train_probe_signal/brier_reward/centered_abs_mean": 0.1797478993733724, "train_probe_signal/brier_reward/group_std_mean": 0.2347598448395729, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0898739496866862, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.0898739496866862, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.04633246548473835, "train_probe_signal/format_reward/group_std_mean": 0.11903019932409127, "train_probe_signal/format_reward/group_zero_std_frac": 0.3888889004786809, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.023166232742369175, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.023166232742369175, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.13617735976974168, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.18357371042172113, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.3617735514041367e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.3617735514041367e-06, "train_probe_steps_per_second": 0.028 }, { "calibration/aurc": 0.18370578182395753, "calibration/batch_distribution_entropy": 0.6835695268089997, "calibration/confidence_entropy": 0.5521079289674705, "calibration/coverage@0%": 0.10159574468085106, "calibration/coverage@1%": 0.10159574468085106, "calibration/coverage@10%": 0.33814308971778234, "calibration/coverage@15%": 0.41161190462824104, "calibration/coverage@20%": 0.5906564128733214, "calibration/coverage@25%": 0.7544129729359501, "calibration/coverage@30%": 0.8873211280234601, "calibration/coverage@5%": 0.10159574468085106, "calibration/ece": 0.1299207501789163, "calibration/mean_confidence": 0.6903119321582707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03315972222222226, "completions/max_length": 3897.0, "completions/max_terminated_length": 3897.0, "completions/mean_length": 934.0389770507812, "completions/mean_terminated_length": 966.2465087890625, "completions/min_length": 0.0, "completions/min_terminated_length": 222.6, "epoch": 0.7319908501143736, "grad_norm": 0.000515034596901387, "learning_rate": 4.4170673076923085e-06, "loss": -0.0302, "num_tokens": 708931635.0, "reward": 1.229620623588562, "reward_std": 0.18012266159057616, "rewards/accuracy_reward": 0.6775173664093017, "rewards/brier_reward": 0.8148699402809143, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9668402791023254, "rewards/mean_confidence_reward": 0.6795672535896301, "signal/accuracy_reward/centered_abs_mean": 0.1504069000482559, "signal/accuracy_reward/group_std_mean": 0.19902244508266448, "signal/accuracy_reward/group_zero_std_frac": 0.43888888955116273, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07520345002412795, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07520345002412795, "signal/advantage_abs_mean": 0.13035764694213867, "signal/advantage_pre_scale_abs_mean": 0.13035764694213867, "signal/advantage_pre_scale_std": 0.23779474198818207, "signal/advantage_std": 0.23779474198818207, "signal/brier_reward/centered_abs_mean": 0.09402061551809311, "signal/brier_reward/group_std_mean": 0.12866690903902053, "signal/brier_reward/group_zero_std_frac": 0.1027777785435319, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.047010307759046556, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.047010307759046556, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.04543185830116272, "signal/format_reward/group_std_mean": 0.07225132212042809, "signal/format_reward/group_zero_std_frac": 0.7472222328186036, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.02271592915058136, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.02271592915058136, "signal/mean_confidence_reward/centered_abs_mean": 0.06965743750333786, "signal/mean_confidence_reward/group_std_mean": 0.09262562543153763, "signal/mean_confidence_reward/group_zero_std_frac": 0.1138888904824853, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.965743637010747e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.965743637010747e-07, "step": 305 }, { "calibration/aurc": 0.14527279461312861, "calibration/batch_distribution_entropy": 0.7392085871677221, "calibration/confidence_entropy": 0.6015698222002837, "calibration/coverage@0%": 0.10147520541534476, "calibration/coverage@1%": 0.10147520541534476, "calibration/coverage@10%": 0.34397574226082234, "calibration/coverage@15%": 0.540763601857322, "calibration/coverage@20%": 0.6409721779145388, "calibration/coverage@25%": 0.8824611599377075, "calibration/coverage@30%": 0.9469502114944314, "calibration/coverage@5%": 0.19634019660651433, "calibration/ece": 0.13218766431631476, "calibration/mean_confidence": 0.6332353201443432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.023697916666666697, "completions/max_length": 3756.4, "completions/max_terminated_length": 3756.4, "completions/mean_length": 876.2470458984375, "completions/mean_terminated_length": 897.4653564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 228.6, "epoch": 0.7439907001162486, "grad_norm": 0.00042756725451909006, "learning_rate": 4.3870192307692315e-06, "loss": -0.0266, "num_tokens": 722111089.0, "reward": 1.236583662033081, "reward_std": 0.16719805002212523, "rewards/accuracy_reward": 0.6828125, "rewards/brier_reward": 0.8140400648117065, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9763020873069763, "rewards/mean_confidence_reward": 0.6290607333183289, "signal/accuracy_reward/centered_abs_mean": 0.14756944477558137, "signal/accuracy_reward/group_std_mean": 0.1985030800104141, "signal/accuracy_reward/group_zero_std_frac": 0.4166666746139526, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07378472238779069, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07378472238779069, "signal/advantage_abs_mean": 0.11503200978040695, "signal/advantage_pre_scale_abs_mean": 0.11503200978040695, "signal/advantage_pre_scale_std": 0.22053115367889403, "signal/advantage_std": 0.22053115367889403, "signal/brier_reward/centered_abs_mean": 0.07882244288921356, "signal/brier_reward/group_std_mean": 0.11283139735460282, "signal/brier_reward/group_zero_std_frac": 0.019444444589316844, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03941122144460678, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03941122144460678, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.03918728269636631, "signal/format_reward/group_std_mean": 0.07152397260069847, "signal/format_reward/group_zero_std_frac": 0.7138888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.019593641348183154, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.019593641348183154, "signal/mean_confidence_reward/centered_abs_mean": 0.06813013553619385, "signal/mean_confidence_reward/group_std_mean": 0.09392532706260681, "signal/mean_confidence_reward/group_zero_std_frac": 0.02222222238779068, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.813013442297233e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.813013442297233e-07, "step": 310 }, { "calibration/aurc": 0.11503333819091577, "calibration/batch_distribution_entropy": 0.7604709525675026, "calibration/confidence_entropy": 0.5787530145975468, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.06648648648648649, "calibration/coverage@10%": 0.496627372052904, "calibration/coverage@15%": 0.712770566919503, "calibration/coverage@20%": 0.798452160367054, "calibration/coverage@25%": 0.8735623921794134, "calibration/coverage@30%": 0.9237607820586543, "calibration/coverage@5%": 0.35940758995262445, "calibration/ece": 0.15914645176858838, "calibration/mean_confidence": 0.6443092411886028, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016753472222222232, "completions/max_length": 3364.4, "completions/max_terminated_length": 3364.4, "completions/mean_length": 863.78046875, "completions/mean_terminated_length": 878.540087890625, "completions/min_length": 0.0, "completions/min_terminated_length": 220.2, "epoch": 0.7559905501181235, "grad_norm": 0.0003748546587303281, "learning_rate": 4.356971153846154e-06, "loss": -0.0179, "num_tokens": 735174000.0, "reward": 1.2646051168441772, "reward_std": 0.15488864183425904, "rewards/accuracy_reward": 0.7111979246139526, "rewards/brier_reward": 0.8348393440246582, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9831597208976746, "rewards/mean_confidence_reward": 0.6619096159934997, "signal/accuracy_reward/centered_abs_mean": 0.13460828959941865, "signal/accuracy_reward/group_std_mean": 0.1837979555130005, "signal/accuracy_reward/group_zero_std_frac": 0.45277778506278993, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06730414479970932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06730414479970932, "signal/advantage_abs_mean": 0.10565023124217987, "signal/advantage_pre_scale_abs_mean": 0.10565023124217987, "signal/advantage_pre_scale_std": 0.20662780702114106, "signal/advantage_std": 0.20662780702114106, "signal/brier_reward/centered_abs_mean": 0.0752529188990593, "signal/brier_reward/group_std_mean": 0.10955016613006592, "signal/brier_reward/group_zero_std_frac": 0.04166666753590107, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03762645944952965, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03762645944952965, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02919921912252903, "signal/format_reward/group_std_mean": 0.05653143748641014, "signal/format_reward/group_zero_std_frac": 0.7611111164093017, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.014599609561264515, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.014599609561264515, "signal/mean_confidence_reward/centered_abs_mean": 0.06733770966529846, "signal/mean_confidence_reward/group_std_mean": 0.09314828664064408, "signal/mean_confidence_reward/group_zero_std_frac": 0.04722222294658422, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.733770987921161e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.733770987921161e-07, "step": 315 }, { "calibration/aurc": 0.15864956622751442, "calibration/batch_distribution_entropy": 0.6842765953975045, "calibration/confidence_entropy": 0.5445845409756707, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.38953177395651634, "calibration/coverage@15%": 0.5735679107545818, "calibration/coverage@20%": 0.747589055983511, "calibration/coverage@25%": 0.8357509275166682, "calibration/coverage@30%": 0.8683511624589929, "calibration/coverage@5%": 0.1287625979843225, "calibration/ece": 0.10916760422130392, "calibration/mean_confidence": 0.6883348675685248, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017013888888888905, "completions/max_length": 3758.2, "completions/max_terminated_length": 3758.2, "completions/mean_length": 815.5128662109375, "completions/mean_terminated_length": 829.7421020507812, "completions/min_length": 0.0, "completions/min_terminated_length": 198.4, "epoch": 0.7679904001199985, "grad_norm": 0.0004901195643469691, "learning_rate": 4.326923076923077e-06, "loss": -0.019, "num_tokens": 747661956.0, "reward": 1.245122456550598, "reward_std": 0.16071979105472564, "rewards/accuracy_reward": 0.6872395873069763, "rewards/brier_reward": 0.820786201953888, "rewards/confidence_one_or_zero": 0.0003472222248092294, "rewards/format_reward": 0.9822048544883728, "rewards/mean_confidence_reward": 0.7115432620048523, "signal/accuracy_reward/centered_abs_mean": 0.1409667983651161, "signal/accuracy_reward/group_std_mean": 0.18892463445663452, "signal/accuracy_reward/group_zero_std_frac": 0.44722222089767455, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07048339918255805, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07048339918255805, "signal/advantage_abs_mean": 0.1146804466843605, "signal/advantage_pre_scale_abs_mean": 0.1146804466843605, "signal/advantage_pre_scale_std": 0.215715628862381, "signal/advantage_std": 0.215715628862381, "signal/brier_reward/centered_abs_mean": 0.08633361905813217, "signal/brier_reward/group_std_mean": 0.11851300746202469, "signal/brier_reward/group_zero_std_frac": 0.16944444552063942, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04316680952906608, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04316680952906608, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006076388992369175, "signal/confidence_one_or_zero/group_std_mean": 0.0009333631955087185, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.076388103792851e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.076388103792851e-09, "signal/format_reward/centered_abs_mean": 0.028965928964316844, "signal/format_reward/group_std_mean": 0.05158567689359188, "signal/format_reward/group_zero_std_frac": 0.7972222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.014482964482158422, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.014482964482158422, "signal/mean_confidence_reward/centered_abs_mean": 0.0645561084151268, "signal/mean_confidence_reward/group_std_mean": 0.08717499077320098, "signal/mean_confidence_reward/group_zero_std_frac": 0.18611111789941787, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.455611355704604e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.455611355704604e-07, "step": 320 }, { "calibration/aurc": 0.17308852336390174, "calibration/batch_distribution_entropy": 0.6357202163738106, "calibration/confidence_entropy": 0.5205371314448661, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.2575687056737589, "calibration/coverage@15%": 0.2740580673758865, "calibration/coverage@20%": 0.7840123350556717, "calibration/coverage@25%": 0.865840451690293, "calibration/coverage@30%": 0.9321198734729494, "calibration/coverage@5%": 0.0, "calibration/ece": 0.10669371491652573, "calibration/mean_confidence": 0.712148178195782, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015538194444444441, "completions/max_length": 3812.2, "completions/max_terminated_length": 3812.2, "completions/mean_length": 840.22978515625, "completions/mean_terminated_length": 853.6203857421875, "completions/min_length": 0.0, "completions/min_terminated_length": 184.4, "epoch": 0.7799902501218735, "grad_norm": 0.0004444665100891143, "learning_rate": 4.296875e-06, "loss": -0.0142, "num_tokens": 760454267.0, "reward": 1.2520793437957765, "reward_std": 0.17104002833366394, "rewards/accuracy_reward": 0.6935763835906983, "rewards/brier_reward": 0.8261065006256103, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9844618201255798, "rewards/mean_confidence_reward": 0.6964852809906006, "signal/accuracy_reward/centered_abs_mean": 0.15925564169883727, "signal/accuracy_reward/group_std_mean": 0.21346534490585328, "signal/accuracy_reward/group_zero_std_frac": 0.37222222685813905, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07962782084941863, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07962782084941863, "signal/advantage_abs_mean": 0.12209683656692505, "signal/advantage_pre_scale_abs_mean": 0.12209683656692505, "signal/advantage_pre_scale_std": 0.21863058507442473, "signal/advantage_std": 0.21863058507442473, "signal/brier_reward/centered_abs_mean": 0.09202589392662049, "signal/brier_reward/group_std_mean": 0.12713525146245958, "signal/brier_reward/group_zero_std_frac": 0.15833333432674407, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04601294696331024, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04601294696331024, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.024159071687608957, "signal/format_reward/group_std_mean": 0.045959899201989174, "signal/format_reward/group_zero_std_frac": 0.8083333492279052, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012079535843804478, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012079535843804478, "signal/mean_confidence_reward/centered_abs_mean": 0.07450359761714935, "signal/mean_confidence_reward/group_std_mean": 0.0989140287041664, "signal/mean_confidence_reward/group_zero_std_frac": 0.18611111044883727, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.45035947602446e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.45035947602446e-07, "step": 325 }, { "calibration/aurc": 0.15548408476048575, "calibration/batch_distribution_entropy": 0.6983713300058271, "calibration/confidence_entropy": 0.5382532601767289, "calibration/coverage@0%": 0.0005221932114882506, "calibration/coverage@1%": 0.0005221932114882506, "calibration/coverage@10%": 0.3672300000065095, "calibration/coverage@15%": 0.4835141929464787, "calibration/coverage@20%": 0.7311968418289705, "calibration/coverage@25%": 0.9082671623155267, "calibration/coverage@30%": 0.9570680628272251, "calibration/coverage@5%": 0.19186456131625845, "calibration/ece": 0.127198297178523, "calibration/mean_confidence": 0.683886956867904, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007899305555555559, "completions/max_length": 3437.2, "completions/max_terminated_length": 3437.2, "completions/mean_length": 800.5682250976563, "completions/mean_terminated_length": 806.8061645507812, "completions/min_length": 0.0, "completions/min_terminated_length": 159.2, "epoch": 0.7919901001237485, "grad_norm": 0.0003628362319432199, "learning_rate": 4.266826923076923e-06, "loss": -0.0068, "num_tokens": 772781805.0, "reward": 1.2744685411453247, "reward_std": 0.13236619532108307, "rewards/accuracy_reward": 0.7107638835906982, "rewards/brier_reward": 0.8461465477943421, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9920138955116272, "rewards/mean_confidence_reward": 0.6363763451576233, "signal/accuracy_reward/centered_abs_mean": 0.14296875149011612, "signal/accuracy_reward/group_std_mean": 0.18968430459499358, "signal/accuracy_reward/group_zero_std_frac": 0.4611111104488373, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07148437574505806, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07148437574505806, "signal/advantage_abs_mean": 0.09383065700531006, "signal/advantage_pre_scale_abs_mean": 0.09383065700531006, "signal/advantage_pre_scale_std": 0.17452815175056458, "signal/advantage_std": 0.17452815175056458, "signal/brier_reward/centered_abs_mean": 0.07751072198152542, "signal/brier_reward/group_std_mean": 0.10652071684598922, "signal/brier_reward/group_zero_std_frac": 0.14166666865348815, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03875536099076271, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03875536099076271, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.014301215298473835, "signal/format_reward/group_std_mean": 0.029196897149086, "signal/format_reward/group_zero_std_frac": 0.8722222328186036, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0071506076492369175, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0071506076492369175, "signal/mean_confidence_reward/centered_abs_mean": 0.07131454646587372, "signal/mean_confidence_reward/group_std_mean": 0.09411249160766602, "signal/mean_confidence_reward/group_zero_std_frac": 0.15833333432674407, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.131454708542151e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.131454708542151e-07, "step": 330 }, { "calibration/aurc": 0.21602418573574952, "calibration/batch_distribution_entropy": 0.8587962532802937, "calibration/confidence_entropy": 0.5464265025585977, "calibration/coverage@0%": 0.05564304461942258, "calibration/coverage@1%": 0.05564304461942258, "calibration/coverage@10%": 0.144110249926331, "calibration/coverage@15%": 0.2464121488730358, "calibration/coverage@20%": 0.45554428748380993, "calibration/coverage@25%": 0.7023530496391933, "calibration/coverage@30%": 0.8305610236220472, "calibration/coverage@5%": 0.13308662787908693, "calibration/ece": 0.15214089378047863, "calibration/mean_confidence": 0.5449980582379619, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00703125, "completions/max_length": 3573.6, "completions/max_terminated_length": 3573.6, "completions/mean_length": 811.8894165039062, "completions/mean_terminated_length": 817.6563232421875, "completions/min_length": 0.0, "completions/min_terminated_length": 191.8, "epoch": 0.8039899501256235, "grad_norm": 0.00042503574513830245, "learning_rate": 4.236778846153847e-06, "loss": -0.0075, "num_tokens": 785259955.0, "reward": 1.2308667421340942, "reward_std": 0.13101654201745988, "rewards/accuracy_reward": 0.654774296283722, "rewards/brier_reward": 0.8139792680740356, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.99296875, "rewards/mean_confidence_reward": 0.5536199808120728, "signal/accuracy_reward/centered_abs_mean": 0.16446940302848817, "signal/accuracy_reward/group_std_mean": 0.2158215045928955, "signal/accuracy_reward/group_zero_std_frac": 0.3861111104488373, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.08223470151424409, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08223470151424409, "signal/advantage_abs_mean": 0.09200264662504196, "signal/advantage_pre_scale_abs_mean": 0.09200264662504196, "signal/advantage_pre_scale_std": 0.16565726399421693, "signal/advantage_std": 0.16565726399421693, "signal/brier_reward/centered_abs_mean": 0.1005157321691513, "signal/brier_reward/group_std_mean": 0.13379413187503814, "signal/brier_reward/group_zero_std_frac": 0.0722222238779068, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.05025786608457565, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.05025786608457565, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.012852647434920072, "signal/format_reward/group_std_mean": 0.02796985171735287, "signal/format_reward/group_zero_std_frac": 0.8722222208976745, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006426323717460036, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006426323717460036, "signal/mean_confidence_reward/centered_abs_mean": 0.083328278362751, "signal/mean_confidence_reward/group_std_mean": 0.10851679444313049, "signal/mean_confidence_reward/group_zero_std_frac": 0.07777777910232545, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.332827746926341e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.332827746926341e-07, "step": 335 }, { "calibration/aurc": 0.16874290381404566, "calibration/batch_distribution_entropy": 0.798382077741308, "calibration/confidence_entropy": 0.5238910411997163, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.42651534883220066, "calibration/coverage@15%": 0.5156626029374903, "calibration/coverage@20%": 0.6063015424506936, "calibration/coverage@25%": 0.7295703047201346, "calibration/coverage@30%": 0.7995863306932336, "calibration/coverage@5%": 0.1515625, "calibration/ece": 0.11768086873565471, "calibration/mean_confidence": 0.5767648732886708, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0059895833333333485, "completions/max_length": 3386.2, "completions/max_terminated_length": 3386.2, "completions/mean_length": 732.4586059570313, "completions/mean_terminated_length": 736.86943359375, "completions/min_length": 0.0, "completions/min_terminated_length": 161.2, "epoch": 0.8159898001274984, "grad_norm": 0.0005026932922191918, "learning_rate": 4.20673076923077e-06, "loss": -0.0066, "num_tokens": 796792150.0, "reward": 1.2638839483261108, "reward_std": 0.12966381758451462, "rewards/accuracy_reward": 0.6963541626930236, "rewards/brier_reward": 0.8374774932861329, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9939236044883728, "rewards/mean_confidence_reward": 0.6329264521598816, "signal/accuracy_reward/centered_abs_mean": 0.13806423544883728, "signal/accuracy_reward/group_std_mean": 0.18163847625255586, "signal/accuracy_reward/group_zero_std_frac": 0.4833333373069763, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06903211772441864, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06903211772441864, "signal/advantage_abs_mean": 0.09496441185474395, "signal/advantage_pre_scale_abs_mean": 0.09496441185474395, "signal/advantage_pre_scale_std": 0.1751703143119812, "signal/advantage_std": 0.1751703143119812, "signal/brier_reward/centered_abs_mean": 0.09618824124336242, "signal/brier_reward/group_std_mean": 0.1273261457681656, "signal/brier_reward/group_zero_std_frac": 0.11666666939854622, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04809412062168121, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04809412062168121, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.010818142723292113, "signal/format_reward/group_std_mean": 0.021273818612098695, "signal/format_reward/group_zero_std_frac": 0.9111111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0054090713616460565, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0054090713616460565, "signal/mean_confidence_reward/centered_abs_mean": 0.07558753341436386, "signal/mean_confidence_reward/group_std_mean": 0.09990677237510681, "signal/mean_confidence_reward/group_zero_std_frac": 0.13055555820465087, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.558753054581757e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.558753054581757e-07, "step": 340 }, { "calibration/aurc": 0.13170268351506, "calibration/batch_distribution_entropy": 0.7991337229523192, "calibration/confidence_entropy": 0.5425895937993038, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.08337730870712401, "calibration/coverage@10%": 0.39640442031586887, "calibration/coverage@15%": 0.6152621315927036, "calibration/coverage@20%": 0.7793854979986106, "calibration/coverage@25%": 0.9006780309761732, "calibration/coverage@30%": 0.9832898172323759, "calibration/coverage@5%": 0.21501108729104185, "calibration/ece": 0.1337062548972485, "calibration/mean_confidence": 0.6162851156108045, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005208333333333348, "completions/max_length": 3557.6, "completions/max_terminated_length": 3557.6, "completions/mean_length": 797.3438354492188, "completions/mean_terminated_length": 801.5601196289062, "completions/min_length": 0.0, "completions/min_terminated_length": 196.4, "epoch": 0.8279896501293734, "grad_norm": 0.0004626760783139616, "learning_rate": 4.176682692307693e-06, "loss": -0.0047, "num_tokens": 809068783.0, "reward": 1.2527307748794556, "reward_std": 0.1347155749797821, "rewards/accuracy_reward": 0.6798611164093018, "rewards/brier_reward": 0.8308826088905334, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947048544883728, "rewards/mean_confidence_reward": 0.6401055932044983, "signal/accuracy_reward/centered_abs_mean": 0.14573567658662795, "signal/accuracy_reward/group_std_mean": 0.1936684489250183, "signal/accuracy_reward/group_zero_std_frac": 0.44166666865348814, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07286783829331397, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07286783829331397, "signal/advantage_abs_mean": 0.09781409353017807, "signal/advantage_pre_scale_abs_mean": 0.09781409353017807, "signal/advantage_pre_scale_std": 0.1775255173444748, "signal/advantage_std": 0.1775255173444748, "signal/brier_reward/centered_abs_mean": 0.08667407780885697, "signal/brier_reward/group_std_mean": 0.11506727337837219, "signal/brier_reward/group_zero_std_frac": 0.10277777835726738, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.043337038904428485, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.043337038904428485, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.009564887173473835, "signal/format_reward/group_std_mean": 0.019964820891618728, "signal/format_reward/group_zero_std_frac": 0.9111111283302307, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004782443586736918, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004782443586736918, "signal/mean_confidence_reward/centered_abs_mean": 0.07610666304826737, "signal/mean_confidence_reward/group_std_mean": 0.09956182688474655, "signal/mean_confidence_reward/group_zero_std_frac": 0.1166666679084301, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.610665988977416e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.610665988977416e-07, "step": 345 }, { "calibration/aurc": 0.2155120638319415, "calibration/batch_distribution_entropy": 0.7631033929937923, "calibration/confidence_entropy": 0.5453056459474718, "calibration/coverage@0%": 0.05759162303664922, "calibration/coverage@1%": 0.14128727521056228, "calibration/coverage@10%": 0.20365353972228545, "calibration/coverage@15%": 0.3280421266788072, "calibration/coverage@20%": 0.42127568005918514, "calibration/coverage@25%": 0.5272094325049587, "calibration/coverage@30%": 0.7996709513724947, "calibration/coverage@5%": 0.1440046665149101, "calibration/ece": 0.12469606653309691, "calibration/mean_confidence": 0.6416474297283733, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007204861111111116, "completions/max_length": 3666.2, "completions/max_terminated_length": 3666.2, "completions/mean_length": 748.625537109375, "completions/mean_terminated_length": 753.95048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 202.8, "epoch": 0.8399895001312484, "grad_norm": 0.00044094465556554496, "learning_rate": 4.146634615384616e-06, "loss": -0.0075, "num_tokens": 820784949.0, "reward": 1.2554822444915772, "reward_std": 0.1254580497741699, "rewards/accuracy_reward": 0.6856770873069763, "rewards/brier_reward": 0.8324791312217712, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9927951455116272, "rewards/mean_confidence_reward": 0.6561909317970276, "signal/accuracy_reward/centered_abs_mean": 0.13560655564069748, "signal/accuracy_reward/group_std_mean": 0.17929703295230864, "signal/accuracy_reward/group_zero_std_frac": 0.4916666686534882, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06780327782034874, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06780327782034874, "signal/advantage_abs_mean": 0.09052754342555999, "signal/advantage_pre_scale_abs_mean": 0.09052754342555999, "signal/advantage_pre_scale_std": 0.17190001308918, "signal/advantage_std": 0.17190001308918, "signal/brier_reward/centered_abs_mean": 0.07575866132974625, "signal/brier_reward/group_std_mean": 0.10197795778512955, "signal/brier_reward/group_zero_std_frac": 0.10277777910232544, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.037879330664873125, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.037879330664873125, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.012255859281867742, "signal/format_reward/group_std_mean": 0.02544722966849804, "signal/format_reward/group_zero_std_frac": 0.8861111164093017, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006127929640933871, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006127929640933871, "signal/mean_confidence_reward/centered_abs_mean": 0.06658856421709061, "signal/mean_confidence_reward/group_std_mean": 0.08765096962451935, "signal/mean_confidence_reward/group_zero_std_frac": 0.10833333730697632, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.658856818830828e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.658856818830828e-07, "step": 350 }, { "epoch": 0.8399895001312484, "eval_calibration/aurc": 0.11466538148682363, "eval_calibration/batch_distribution_entropy": 0.7077095444744549, "eval_calibration/confidence_entropy": 0.5498132072537738, "eval_calibration/coverage@0%": 0.21354166666666666, "eval_calibration/coverage@1%": 0.21354166666666666, "eval_calibration/coverage@10%": 0.5625, "eval_calibration/coverage@15%": 0.59375, "eval_calibration/coverage@20%": 0.7708333333333334, "eval_calibration/coverage@25%": 0.9479166666666666, "eval_calibration/coverage@30%": 0.984375, "eval_calibration/coverage@5%": 0.4479166666666667, "eval_calibration/ece": 0.1683572093878923, "eval_calibration/mean_confidence": 0.6741731816101145, "eval_completions/clipped_ratio": 0.006944444444444438, "eval_completions/max_length": 2159.5, "eval_completions/max_terminated_length": 2159.5, "eval_completions/mean_length": 749.7206827799479, "eval_completions/mean_terminated_length": 755.0474548339844, "eval_completions/min_length": 68.33333333333333, "eval_completions/min_terminated_length": 218.33333333333334, "eval_loss": 0.0, "eval_num_tokens": 820784949.0, "eval_reward": 1.2729398608207703, "eval_reward_std": 0.2971045523881912, "eval_rewards/accuracy_reward": 0.7083333333333334, "eval_rewards/brier_reward": 0.8444774548212687, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9930555522441864, "eval_rewards/mean_confidence_reward": 0.6642921964327494, "eval_runtime": 173.4735, "eval_samples_per_second": 5.765, "eval_signal/accuracy_reward/centered_abs_mean": 0.3975694427887599, "eval_signal/accuracy_reward/group_std_mean": 0.45154231290022534, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19878472139437994, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.19878472139437994, "eval_signal/advantage_abs_mean": 0.2512461915612221, "eval_signal/advantage_pre_scale_abs_mean": 0.2512461915612221, "eval_signal/advantage_pre_scale_std": 0.2956870098908742, "eval_signal/advantage_std": 0.2956870098908742, "eval_signal/brier_reward/centered_abs_mean": 0.13479836036761603, "eval_signal/brier_reward/group_std_mean": 0.18237248063087463, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.06739918018380801, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.06739918018380801, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.013237847170482079, "eval_signal/format_reward/group_std_mean": 0.03330489216993252, "eval_signal/format_reward/group_zero_std_frac": 0.8333333532015482, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.006618923585241039, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.006618923585241039, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.15164848168691, "eval_signal/mean_confidence_reward/group_std_mean": 0.19345779716968536, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.5164847203171423e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.5164847203171423e-06, "eval_steps_per_second": 0.035, "step": 350 }, { "epoch": 0.8399895001312484, "step": 350, "train_probe_calibration/aurc": 0.1376924358765411, "train_probe_calibration/batch_distribution_entropy": 0.7290346718241966, "train_probe_calibration/confidence_entropy": 0.553157512968658, "train_probe_calibration/coverage@0%": 0.21320564516129034, "train_probe_calibration/coverage@1%": 0.21320564516129034, "train_probe_calibration/coverage@10%": 0.4321236559139785, "train_probe_calibration/coverage@15%": 0.6384408602150539, "train_probe_calibration/coverage@20%": 0.8005712365591399, "train_probe_calibration/coverage@25%": 0.900369623655914, "train_probe_calibration/coverage@30%": 0.96875, "train_probe_calibration/coverage@5%": 0.21320564516129034, "train_probe_calibration/ece": 0.1592880861117193, "train_probe_calibration/mean_confidence": 0.6652275834039502, "train_probe_completions/clipped_ratio": 0.008680555555555561, "train_probe_completions/max_length": 2512.1666666666665, "train_probe_completions/max_terminated_length": 2512.1666666666665, "train_probe_completions/mean_length": 760.2239786783854, "train_probe_completions/mean_terminated_length": 766.9147338867188, "train_probe_completions/min_length": 36.0, "train_probe_completions/min_terminated_length": 202.66666666666666, "train_probe_loss": 0.0, "train_probe_num_tokens": 820784949.0, "train_probe_reward": 1.289116342862447, "train_probe_reward_std": 0.2903154144684474, "train_probe_rewards/accuracy_reward": 0.7361111144224802, "train_probe_rewards/brier_reward": 0.8507888118426005, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9913194477558136, "train_probe_rewards/mean_confidence_reward": 0.6609991987546285, "train_probe_runtime": 207.2881, "train_probe_samples_per_second": 4.824, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3772786508003871, "train_probe_signal/accuracy_reward/group_std_mean": 0.43917299310366315, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.18863932540019354, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.18863932540019354, "train_probe_signal/advantage_abs_mean": 0.23864888151486716, "train_probe_signal/advantage_pre_scale_abs_mean": 0.23864888151486716, "train_probe_signal/advantage_pre_scale_std": 0.28990621864795685, "train_probe_signal/advantage_std": 0.28990621864795685, "train_probe_signal/brier_reward/centered_abs_mean": 0.12855459998051325, "train_probe_signal/brier_reward/group_std_mean": 0.17542100449403128, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.06427729999025662, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.06427729999025662, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.016493055503815413, "train_probe_signal/format_reward/group_std_mean": 0.0425994840140144, "train_probe_signal/format_reward/group_zero_std_frac": 0.7777778009573618, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.008246527751907706, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.008246527751907706, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.15471457441647848, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.196126955250899, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.547145719390149e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.547145719390149e-06, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.2423558288000447, "calibration/batch_distribution_entropy": 0.7655905297064465, "calibration/confidence_entropy": 0.5708759872994722, "calibration/coverage@0%": 0.012010443864229765, "calibration/coverage@1%": 0.012010443864229765, "calibration/coverage@10%": 0.21044386422976502, "calibration/coverage@15%": 0.2950391644908616, "calibration/coverage@20%": 0.3279373368146214, "calibration/coverage@25%": 0.5608598999129677, "calibration/coverage@30%": 0.5779265665796345, "calibration/coverage@5%": 0.20783289817232378, "calibration/ece": 0.1378207402154509, "calibration/mean_confidence": 0.6397090572619596, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00616319444444442, "completions/max_length": 3543.4, "completions/max_terminated_length": 3543.4, "completions/mean_length": 780.4750122070312, "completions/mean_terminated_length": 785.2974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 202.0, "epoch": 0.8519893501331234, "grad_norm": 0.0004276749968994409, "learning_rate": 4.116586538461539e-06, "loss": -0.0066, "num_tokens": 832874613.0, "reward": 1.2605106830596924, "reward_std": 0.1331548124551773, "rewards/accuracy_reward": 0.6957465291023255, "rewards/brier_reward": 0.8317722797393798, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895753860473, "rewards/mean_confidence_reward": 0.6480299472808838, "signal/accuracy_reward/centered_abs_mean": 0.14163953959941863, "signal/accuracy_reward/group_std_mean": 0.1877330332994461, "signal/accuracy_reward/group_zero_std_frac": 0.4638888895511627, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.07081976979970932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07081976979970932, "signal/advantage_abs_mean": 0.09516175389289856, "signal/advantage_pre_scale_abs_mean": 0.09516175389289856, "signal/advantage_pre_scale_std": 0.17578432559967042, "signal/advantage_std": 0.17578432559967042, "signal/brier_reward/centered_abs_mean": 0.07110370546579362, "signal/brier_reward/group_std_mean": 0.09785515367984772, "signal/brier_reward/group_zero_std_frac": 0.0444444453343749, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03555185273289681, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03555185273289681, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01194118931889534, "signal/format_reward/group_std_mean": 0.025462664663791656, "signal/format_reward/group_zero_std_frac": 0.8861111164093017, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00597059465944767, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00597059465944767, "signal/mean_confidence_reward/centered_abs_mean": 0.06324325874447823, "signal/mean_confidence_reward/group_std_mean": 0.08281998485326766, "signal/mean_confidence_reward/group_zero_std_frac": 0.0444444453343749, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.324325795503682e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.324325795503682e-07, "step": 355 }, { "calibration/aurc": 0.10494547164413107, "calibration/batch_distribution_entropy": 0.7331405035106856, "calibration/confidence_entropy": 0.5994114788788631, "calibration/coverage@0%": 0.10962521758050478, "calibration/coverage@1%": 0.17100087895616617, "calibration/coverage@10%": 0.5539412506734636, "calibration/coverage@15%": 0.6726227818876318, "calibration/coverage@20%": 0.8074723361929628, "calibration/coverage@25%": 0.9103015734869521, "calibration/coverage@30%": 0.9815229253871551, "calibration/coverage@5%": 0.4772210048074931, "calibration/ece": 0.18747137779418793, "calibration/mean_confidence": 0.6113017506396201, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010243055555555557, "completions/max_length": 2930.4, "completions/max_terminated_length": 2930.4, "completions/mean_length": 764.0307373046875, "completions/mean_terminated_length": 771.9576782226562, "completions/min_length": 0.0, "completions/min_terminated_length": 215.8, "epoch": 0.8639892001349984, "grad_norm": 0.0005043584387749434, "learning_rate": 4.086538461538462e-06, "loss": -0.0114, "num_tokens": 844764567.0, "reward": 1.2541569471359253, "reward_std": 0.1254362493753433, "rewards/accuracy_reward": 0.6883680582046509, "rewards/brier_reward": 0.8303507208824158, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833253860473, "rewards/mean_confidence_reward": 0.5880884289741516, "signal/accuracy_reward/centered_abs_mean": 0.1400607630610466, "signal/accuracy_reward/group_std_mean": 0.18564009070396423, "signal/accuracy_reward/group_zero_std_frac": 0.4611111104488373, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0700303815305233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0700303815305233, "signal/advantage_abs_mean": 0.08988471031188965, "signal/advantage_pre_scale_abs_mean": 0.08988471031188965, "signal/advantage_pre_scale_std": 0.1728453516960144, "signal/advantage_std": 0.1728453516960144, "signal/brier_reward/centered_abs_mean": 0.06908701285719872, "signal/brier_reward/group_std_mean": 0.09380456060171127, "signal/brier_reward/group_zero_std_frac": 0.05000000093132258, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03454350642859936, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03454350642859936, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01763237863779068, "signal/format_reward/group_std_mean": 0.03316636718809605, "signal/format_reward/group_zero_std_frac": 0.8638888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00881618931889534, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00881618931889534, "signal/mean_confidence_reward/centered_abs_mean": 0.06198589876294136, "signal/mean_confidence_reward/group_std_mean": 0.08041212558746338, "signal/mean_confidence_reward/group_zero_std_frac": 0.052777778916060926, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.198589971972979e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.198589971972979e-07, "step": 360 }, { "calibration/aurc": 0.1287328575382296, "calibration/batch_distribution_entropy": 0.7098329270944501, "calibration/confidence_entropy": 0.5534849731767872, "calibration/coverage@0%": 0.14732179870367826, "calibration/coverage@1%": 0.3507704497045486, "calibration/coverage@10%": 0.5486036001370083, "calibration/coverage@15%": 0.6500323678572347, "calibration/coverage@20%": 0.6874549480544296, "calibration/coverage@25%": 0.7263134000463383, "calibration/coverage@30%": 0.8830196794947763, "calibration/coverage@5%": 0.4283672587146718, "calibration/ece": 0.18332637448061406, "calibration/mean_confidence": 0.6548396114971501, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0050347222222222095, "completions/max_length": 3383.4, "completions/max_terminated_length": 3383.4, "completions/mean_length": 781.502880859375, "completions/mean_terminated_length": 785.4791015625, "completions/min_length": 0.0, "completions/min_terminated_length": 183.6, "epoch": 0.8759890501368733, "grad_norm": 0.0005086059682071209, "learning_rate": 4.0564903846153846e-06, "loss": -0.0058, "num_tokens": 856865176.0, "reward": 1.2718287467956544, "reward_std": 0.13091017454862594, "rewards/accuracy_reward": 0.7092013835906983, "rewards/brier_reward": 0.8394775271415711, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9949652671813964, "rewards/mean_confidence_reward": 0.6617013573646545, "signal/accuracy_reward/centered_abs_mean": 0.13508029729127885, "signal/accuracy_reward/group_std_mean": 0.18672677874565125, "signal/accuracy_reward/group_zero_std_frac": 0.43888888955116273, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06754014864563943, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06754014864563943, "signal/advantage_abs_mean": 0.09080385863780975, "signal/advantage_pre_scale_abs_mean": 0.09080385863780975, "signal/advantage_pre_scale_std": 0.17359325289726257, "signal/advantage_std": 0.17359325289726257, "signal/brier_reward/centered_abs_mean": 0.07011737525463105, "signal/brier_reward/group_std_mean": 0.0971314176917076, "signal/brier_reward/group_zero_std_frac": 0.047222222201526165, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.035058687627315524, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.035058687627315524, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.009244791697710752, "signal/format_reward/group_std_mean": 0.020097243785858154, "signal/format_reward/group_zero_std_frac": 0.9083333492279053, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004622395848855376, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004622395848855376, "signal/mean_confidence_reward/centered_abs_mean": 0.056587792187929156, "signal/mean_confidence_reward/group_std_mean": 0.07685085088014602, "signal/mean_confidence_reward/group_zero_std_frac": 0.06111111138015986, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.65877917324542e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.65877917324542e-07, "step": 365 }, { "calibration/aurc": 0.12614879223130504, "calibration/batch_distribution_entropy": 0.5303573137044758, "calibration/confidence_entropy": 0.5416012709225069, "calibration/coverage@0%": 0.19289093598852886, "calibration/coverage@1%": 0.19289093598852886, "calibration/coverage@10%": 0.4706836535955519, "calibration/coverage@15%": 0.5712315284403922, "calibration/coverage@20%": 0.9032786885245901, "calibration/coverage@25%": 0.9251366120218579, "calibration/coverage@30%": 0.9415300546448087, "calibration/coverage@5%": 0.3714669434127843, "calibration/ece": 0.15389461734032422, "calibration/mean_confidence": 0.7209384855327899, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007291666666666652, "completions/max_length": 3598.6, "completions/max_terminated_length": 3598.6, "completions/mean_length": 742.0171020507812, "completions/mean_terminated_length": 747.5368530273438, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.8879889001387483, "grad_norm": 0.0005085735465399921, "learning_rate": 4.026442307692308e-06, "loss": -0.0054, "num_tokens": 868484477.0, "reward": 1.304303812980652, "reward_std": 0.12730777710676194, "rewards/accuracy_reward": 0.7599826335906983, "rewards/brier_reward": 0.8559027194976807, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9927083373069763, "rewards/mean_confidence_reward": 0.699118721485138, "signal/accuracy_reward/centered_abs_mean": 0.12821723371744156, "signal/accuracy_reward/group_std_mean": 0.17075916528701782, "signal/accuracy_reward/group_zero_std_frac": 0.5083333432674408, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06410861685872078, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06410861685872078, "signal/advantage_abs_mean": 0.09198119938373565, "signal/advantage_pre_scale_abs_mean": 0.09198119938373565, "signal/advantage_pre_scale_std": 0.18210922479629515, "signal/advantage_std": 0.18210922479629515, "signal/brier_reward/centered_abs_mean": 0.07101348340511322, "signal/brier_reward/group_std_mean": 0.09601679742336273, "signal/brier_reward/group_zero_std_frac": 0.03055555634200573, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03550674170255661, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03550674170255661, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.011990017350763082, "signal/format_reward/group_std_mean": 0.02386271096765995, "signal/format_reward/group_zero_std_frac": 0.8972222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005995008675381541, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005995008675381541, "signal/mean_confidence_reward/centered_abs_mean": 0.056539113819599154, "signal/mean_confidence_reward/group_std_mean": 0.07439476102590561, "signal/mean_confidence_reward/group_zero_std_frac": 0.03888888955116272, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.653911102854181e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.653911102854181e-07, "step": 370 }, { "calibration/aurc": 0.14005854285736122, "calibration/batch_distribution_entropy": 0.5845706781854919, "calibration/confidence_entropy": 0.537635153420035, "calibration/coverage@0%": 0.04746246736292428, "calibration/coverage@1%": 0.04746246736292428, "calibration/coverage@10%": 0.47146703655352484, "calibration/coverage@15%": 0.6251128698868582, "calibration/coverage@20%": 0.6787587032201915, "calibration/coverage@25%": 0.8279849325500435, "calibration/coverage@30%": 0.8650293733681462, "calibration/coverage@5%": 0.30948786988685817, "calibration/ece": 0.13588787560008456, "calibration/mean_confidence": 0.7202132341886431, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0050347222222222095, "completions/max_length": 3205.6, "completions/max_terminated_length": 3205.6, "completions/mean_length": 772.2670288085938, "completions/mean_terminated_length": 776.1182495117188, "completions/min_length": 0.0, "completions/min_terminated_length": 195.6, "epoch": 0.8999887501406233, "grad_norm": 0.00037806283216923475, "learning_rate": 3.996394230769231e-06, "loss": -0.0041, "num_tokens": 880484385.0, "reward": 1.2805837392807007, "reward_std": 0.12322836518287658, "rewards/accuracy_reward": 0.7142361044883728, "rewards/brier_reward": 0.8520391702651977, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9948784589767456, "rewards/mean_confidence_reward": 0.6796824932098389, "signal/accuracy_reward/centered_abs_mean": 0.12817925214767456, "signal/accuracy_reward/group_std_mean": 0.17277657091617585, "signal/accuracy_reward/group_zero_std_frac": 0.5000000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06408962607383728, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06408962607383728, "signal/advantage_abs_mean": 0.0883659228682518, "signal/advantage_pre_scale_abs_mean": 0.0883659228682518, "signal/advantage_pre_scale_std": 0.17259128093719484, "signal/advantage_std": 0.17259128093719484, "signal/brier_reward/centered_abs_mean": 0.07010068967938424, "signal/brier_reward/group_std_mean": 0.095072703063488, "signal/brier_reward/group_zero_std_frac": 0.03055555634200573, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03505034483969212, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03505034483969212, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.009271918330341578, "signal/format_reward/group_std_mean": 0.019891217723488806, "signal/format_reward/group_zero_std_frac": 0.9083333492279053, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004635959165170789, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004635959165170789, "signal/mean_confidence_reward/centered_abs_mean": 0.0554458886384964, "signal/mean_confidence_reward/group_std_mean": 0.07442165762186051, "signal/mean_confidence_reward/group_zero_std_frac": 0.03055555634200573, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.544588930206373e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.544588930206373e-07, "step": 375 }, { "calibration/aurc": 0.15258077860033226, "calibration/batch_distribution_entropy": 0.6162681970661794, "calibration/confidence_entropy": 0.5365360145009496, "calibration/coverage@0%": 0.09017543859649121, "calibration/coverage@1%": 0.09017543859649121, "calibration/coverage@10%": 0.26707926801337545, "calibration/coverage@15%": 0.4935476644450552, "calibration/coverage@20%": 0.7103523218130181, "calibration/coverage@25%": 0.8477982456140352, "calibration/coverage@30%": 0.9752631578947369, "calibration/coverage@5%": 0.16584721840502037, "calibration/ece": 0.10143870423934506, "calibration/mean_confidence": 0.7101023182924022, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005729166666666674, "completions/max_length": 3573.6, "completions/max_terminated_length": 3573.6, "completions/mean_length": 772.63212890625, "completions/mean_terminated_length": 777.0876708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.9119886001424983, "grad_norm": 0.0003984541108366102, "learning_rate": 3.966346153846154e-06, "loss": -0.0042, "num_tokens": 892516147.0, "reward": 1.2883473873138427, "reward_std": 0.1191188246011734, "rewards/accuracy_reward": 0.7342881917953491, "rewards/brier_reward": 0.8481219172477722, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9942708253860474, "rewards/mean_confidence_reward": 0.6875454425811768, "signal/accuracy_reward/centered_abs_mean": 0.12490776926279068, "signal/accuracy_reward/group_std_mean": 0.1661813288927078, "signal/accuracy_reward/group_zero_std_frac": 0.5166666686534882, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06245388463139534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06245388463139534, "signal/advantage_abs_mean": 0.08785299956798553, "signal/advantage_pre_scale_abs_mean": 0.08785299956798553, "signal/advantage_pre_scale_std": 0.1721292644739151, "signal/advantage_std": 0.1721292644739151, "signal/brier_reward/centered_abs_mean": 0.06520998179912567, "signal/brier_reward/group_std_mean": 0.08714500814676285, "signal/brier_reward/group_zero_std_frac": 0.04722222257405519, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.032604990899562834, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.032604990899562834, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.009450954757630825, "signal/format_reward/group_std_mean": 0.016871258057653903, "signal/format_reward/group_zero_std_frac": 0.9333333492279052, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004725477378815413, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004725477378815413, "signal/mean_confidence_reward/centered_abs_mean": 0.05295726507902145, "signal/mean_confidence_reward/group_std_mean": 0.06964650005102158, "signal/mean_confidence_reward/group_zero_std_frac": 0.05277777817100286, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.295726452914096e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.295726452914096e-07, "step": 380 }, { "calibration/aurc": 0.17686581782623784, "calibration/batch_distribution_entropy": 0.52090132258773, "calibration/confidence_entropy": 0.500673771850931, "calibration/coverage@0%": 0.018354500891265595, "calibration/coverage@1%": 0.018354500891265595, "calibration/coverage@10%": 0.28048406862745096, "calibration/coverage@15%": 0.6600345394583345, "calibration/coverage@20%": 0.7331284530708999, "calibration/coverage@25%": 0.7581857174688057, "calibration/coverage@30%": 0.8390625, "calibration/coverage@5%": 0.018354500891265595, "calibration/ece": 0.13017879662326218, "calibration/mean_confidence": 0.7627044306681302, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00616319444444442, "completions/max_length": 3599.8, "completions/max_terminated_length": 3599.8, "completions/mean_length": 813.7527099609375, "completions/mean_terminated_length": 818.8523071289062, "completions/min_length": 44.0, "completions/min_terminated_length": 218.4, "epoch": 0.9239884501443731, "grad_norm": 0.0005658803856931627, "learning_rate": 3.936298076923077e-06, "loss": -0.0062, "num_tokens": 904994226.0, "reward": 1.2709124326705932, "reward_std": 0.13287843018770218, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.8331289529800415, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9938368082046509, "rewards/mean_confidence_reward": 0.7607030987739563, "signal/accuracy_reward/centered_abs_mean": 0.12478841245174407, "signal/accuracy_reward/group_std_mean": 0.1681414306163788, "signal/accuracy_reward/group_zero_std_frac": 0.5055555641651154, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06239420622587204, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06239420622587204, "signal/advantage_abs_mean": 0.09677838385105134, "signal/advantage_pre_scale_abs_mean": 0.09677838385105134, "signal/advantage_pre_scale_std": 0.19231151938438415, "signal/advantage_std": 0.19231151938438415, "signal/brier_reward/centered_abs_mean": 0.07141073867678642, "signal/brier_reward/group_std_mean": 0.09745891690254212, "signal/brier_reward/group_zero_std_frac": 0.18055555373430252, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03570536933839321, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03570536933839321, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.00974934883415699, "signal/format_reward/group_std_mean": 0.017312010750174522, "signal/format_reward/group_zero_std_frac": 0.9305555701255799, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004874674417078495, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004874674417078495, "signal/mean_confidence_reward/centered_abs_mean": 0.0422736831009388, "signal/mean_confidence_reward/group_std_mean": 0.05735933929681778, "signal/mean_confidence_reward/group_zero_std_frac": 0.20277777910232545, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.22736832206283e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.22736832206283e-07, "step": 385 }, { "calibration/aurc": 0.15260965337571247, "calibration/batch_distribution_entropy": 0.6444420471528992, "calibration/confidence_entropy": 0.4504811823747761, "calibration/coverage@0%": 0.05691263089005235, "calibration/coverage@1%": 0.05691263089005235, "calibration/coverage@10%": 0.41369477043548, "calibration/coverage@15%": 0.5801295031190337, "calibration/coverage@20%": 0.6484376423956183, "calibration/coverage@25%": 0.7372763961605584, "calibration/coverage@30%": 0.943008289703316, "calibration/coverage@5%": 0.26351796491599794, "calibration/ece": 0.13345170204058618, "calibration/mean_confidence": 0.7817843796420971, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666666666674, "completions/max_length": 3569.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 824.2099853515625, "completions/mean_terminated_length": 827.619384765625, "completions/min_length": 0.0, "completions/min_terminated_length": 201.6, "epoch": 0.9359883001462481, "grad_norm": 0.000490221194922924, "learning_rate": 3.90625e-06, "loss": -0.0032, "num_tokens": 917598373.0, "reward": 1.2687837839126588, "reward_std": 0.13714092522859572, "rewards/accuracy_reward": 0.7135416626930237, "rewards/brier_reward": 0.8284369111061096, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9955729246139526, "rewards/mean_confidence_reward": 0.7958133697509766, "signal/accuracy_reward/centered_abs_mean": 0.12468532621860504, "signal/accuracy_reward/group_std_mean": 0.16752224266529084, "signal/accuracy_reward/group_zero_std_frac": 0.5055555760860443, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06234266310930252, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06234266310930252, "signal/advantage_abs_mean": 0.1010126069188118, "signal/advantage_pre_scale_abs_mean": 0.1010126069188118, "signal/advantage_pre_scale_std": 0.19631874859333037, "signal/advantage_std": 0.19631874859333037, "signal/brier_reward/centered_abs_mean": 0.08110525757074356, "signal/brier_reward/group_std_mean": 0.10810945332050323, "signal/brier_reward/group_zero_std_frac": 0.10833333432674408, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04055262878537178, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04055262878537178, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.00785047747194767, "signal/format_reward/group_std_mean": 0.01563947144895792, "signal/format_reward/group_zero_std_frac": 0.9333333492279052, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.003925238735973835, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.003925238735973835, "signal/mean_confidence_reward/centered_abs_mean": 0.04991179332137108, "signal/mean_confidence_reward/group_std_mean": 0.06657638847827911, "signal/mean_confidence_reward/group_zero_std_frac": 0.1250000014901161, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.991179480384745e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.991179480384745e-07, "step": 390 }, { "calibration/aurc": 0.1354932838566389, "calibration/batch_distribution_entropy": 0.6116846111174865, "calibration/confidence_entropy": 0.37487506492702877, "calibration/coverage@0%": 0.08854166666666666, "calibration/coverage@1%": 0.18592386561954624, "calibration/coverage@10%": 0.4943872994539814, "calibration/coverage@15%": 0.5543450760074673, "calibration/coverage@20%": 0.6805339255394525, "calibration/coverage@25%": 0.7902247093297486, "calibration/coverage@30%": 0.847767454954955, "calibration/coverage@5%": 0.3277932422600583, "calibration/ece": 0.16145373707372607, "calibration/mean_confidence": 0.8324234758345312, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009027777777777768, "completions/max_length": 3838.4, "completions/max_terminated_length": 3838.4, "completions/mean_length": 906.5050537109375, "completions/mean_terminated_length": 914.87265625, "completions/min_length": 0.0, "completions/min_terminated_length": 237.4, "epoch": 0.9479881501481231, "grad_norm": 0.000490121659822762, "learning_rate": 3.876201923076923e-06, "loss": -0.0108, "num_tokens": 931175359.0, "reward": 1.2568167924880982, "reward_std": 0.14161943942308425, "rewards/accuracy_reward": 0.6959201335906983, "rewards/brier_reward": 0.826724898815155, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9909722208976746, "rewards/mean_confidence_reward": 0.8181571006774903, "signal/accuracy_reward/centered_abs_mean": 0.1205132395029068, "signal/accuracy_reward/group_std_mean": 0.1588496595621109, "signal/accuracy_reward/group_zero_std_frac": 0.547222226858139, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0602566197514534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0602566197514534, "signal/advantage_abs_mean": 0.10578400492668152, "signal/advantage_pre_scale_abs_mean": 0.10578400492668152, "signal/advantage_pre_scale_std": 0.20728633999824525, "signal/advantage_std": 0.20728633999824525, "signal/brier_reward/centered_abs_mean": 0.08919335603713989, "signal/brier_reward/group_std_mean": 0.11672022640705108, "signal/brier_reward/group_zero_std_frac": 0.13888888955116271, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.044596678018569945, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.044596678018569945, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01488715298473835, "signal/format_reward/group_std_mean": 0.025769182294607163, "signal/format_reward/group_zero_std_frac": 0.9027777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007443576492369175, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007443576492369175, "signal/mean_confidence_reward/centered_abs_mean": 0.05335674062371254, "signal/mean_confidence_reward/group_std_mean": 0.07230115979909897, "signal/mean_confidence_reward/group_zero_std_frac": 0.15000000298023225, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.335673449735623e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.335673449735623e-07, "step": 395 }, { "calibration/aurc": 0.1542307199234949, "calibration/batch_distribution_entropy": 0.6188817937175487, "calibration/confidence_entropy": 0.3709801365820278, "calibration/coverage@0%": 0.11204191352071277, "calibration/coverage@1%": 0.2437179932777588, "calibration/coverage@10%": 0.5072775980653803, "calibration/coverage@15%": 0.5976812146014643, "calibration/coverage@20%": 0.6560180014504766, "calibration/coverage@25%": 0.7286190789473684, "calibration/coverage@30%": 0.7530541666666666, "calibration/coverage@5%": 0.34256616081702584, "calibration/ece": 0.15680901144560183, "calibration/mean_confidence": 0.8273161933731916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010069444444444464, "completions/max_length": 3729.2, "completions/max_terminated_length": 3729.2, "completions/mean_length": 896.7393310546875, "completions/mean_terminated_length": 905.817578125, "completions/min_length": 0.0, "completions/min_terminated_length": 238.0, "epoch": 0.9599880001499981, "grad_norm": 0.00048337329644709826, "learning_rate": 3.846153846153847e-06, "loss": -0.0116, "num_tokens": 944576996.0, "reward": 1.2653816938400269, "reward_std": 0.14297903776168824, "rewards/accuracy_reward": 0.7138020753860473, "rewards/brier_reward": 0.8270145058631897, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9899305582046509, "rewards/mean_confidence_reward": 0.8113420128822326, "signal/accuracy_reward/centered_abs_mean": 0.12461480051279068, "signal/accuracy_reward/group_std_mean": 0.16390288770198821, "signal/accuracy_reward/group_zero_std_frac": 0.5361111283302307, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06230740025639534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06230740025639534, "signal/advantage_abs_mean": 0.10457079857587814, "signal/advantage_pre_scale_abs_mean": 0.10457079857587814, "signal/advantage_pre_scale_std": 0.20576355755329132, "signal/advantage_std": 0.20576355755329132, "signal/brier_reward/centered_abs_mean": 0.0869052067399025, "signal/brier_reward/group_std_mean": 0.11690500974655152, "signal/brier_reward/group_zero_std_frac": 0.1722222238779068, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04345260336995125, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04345260336995125, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01733940951526165, "signal/format_reward/group_std_mean": 0.03186274319887161, "signal/format_reward/group_zero_std_frac": 0.8750000119209289, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.008669704757630825, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.008669704757630825, "signal/mean_confidence_reward/centered_abs_mean": 0.05921415910124779, "signal/mean_confidence_reward/group_std_mean": 0.0786533772945404, "signal/mean_confidence_reward/group_zero_std_frac": 0.18055555671453477, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.921415549892117e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.921415549892117e-07, "step": 400 }, { "epoch": 0.9599880001499981, "eval_calibration/aurc": 0.11659332067535128, "eval_calibration/batch_distribution_entropy": 0.6489400512442423, "eval_calibration/confidence_entropy": 0.3923535694413262, "eval_calibration/coverage@0%": 0.28125, "eval_calibration/coverage@1%": 0.28125, "eval_calibration/coverage@10%": 0.4895833333333333, "eval_calibration/coverage@15%": 0.6197916666666666, "eval_calibration/coverage@20%": 0.8333333333333334, "eval_calibration/coverage@25%": 0.9114583333333334, "eval_calibration/coverage@30%": 0.96875, "eval_calibration/coverage@5%": 0.328125, "eval_calibration/ece": 0.16380208333333332, "eval_calibration/mean_confidence": 0.8039062499999999, "eval_completions/clipped_ratio": 0.0034722222222222285, "eval_completions/max_length": 2852.0, "eval_completions/max_terminated_length": 2852.0, "eval_completions/mean_length": 910.8159891764323, "eval_completions/mean_terminated_length": 913.9169413248698, "eval_completions/min_length": 175.16666666666666, "eval_completions/min_terminated_length": 294.5, "eval_loss": 0.0, "eval_num_tokens": 944576996.0, "eval_reward": 1.2716324130694072, "eval_reward_std": 0.33539749681949615, "eval_rewards/accuracy_reward": 0.7100694477558136, "eval_rewards/brier_reward": 0.8375198046366373, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9956597288449606, "eval_rewards/mean_confidence_reward": 0.7895139157772064, "eval_runtime": 191.4278, "eval_samples_per_second": 5.224, "eval_signal/accuracy_reward/centered_abs_mean": 0.3949652761220932, "eval_signal/accuracy_reward/group_std_mean": 0.4495016684134801, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1974826380610466, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.1974826380610466, "eval_signal/advantage_abs_mean": 0.28470087548096973, "eval_signal/advantage_pre_scale_abs_mean": 0.28470087548096973, "eval_signal/advantage_pre_scale_std": 0.3334394842386246, "eval_signal/advantage_std": 0.3334394842386246, "eval_signal/brier_reward/centered_abs_mean": 0.1923588365316391, "eval_signal/brier_reward/group_std_mean": 0.25036589801311493, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09617941826581955, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.09617941826581955, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.008300781094779571, "eval_signal/format_reward/group_std_mean": 0.021562910017867882, "eval_signal/format_reward/group_zero_std_frac": 0.8888889153798422, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.004150390547389786, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.004150390547389786, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.1580235535899798, "eval_signal/mean_confidence_reward/group_std_mean": 0.2027751257022222, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.5802355240642403e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.5802355240642403e-06, "eval_steps_per_second": 0.031, "step": 400 }, { "epoch": 0.9599880001499981, "step": 400, "train_probe_calibration/aurc": 0.12334070207739271, "train_probe_calibration/batch_distribution_entropy": 0.6584033159789989, "train_probe_calibration/confidence_entropy": 0.40050344937590854, "train_probe_calibration/coverage@0%": 0.109375, "train_probe_calibration/coverage@1%": 0.109375, "train_probe_calibration/coverage@10%": 0.5571236559139785, "train_probe_calibration/coverage@15%": 0.734375, "train_probe_calibration/coverage@20%": 0.8697916666666666, "train_probe_calibration/coverage@25%": 0.9322916666666666, "train_probe_calibration/coverage@30%": 0.953125, "train_probe_calibration/coverage@5%": 0.24915994623655915, "train_probe_calibration/ece": 0.13872143817204297, "train_probe_calibration/mean_confidence": 0.7896118951612903, "train_probe_completions/clipped_ratio": 0.006944444444444438, "train_probe_completions/max_length": 3242.0, "train_probe_completions/max_terminated_length": 3242.0, "train_probe_completions/mean_length": 920.172597249349, "train_probe_completions/mean_terminated_length": 926.5815022786459, "train_probe_completions/min_length": 111.66666666666667, "train_probe_completions/min_terminated_length": 282.5, "train_probe_loss": 0.0, "train_probe_num_tokens": 944576996.0, "train_probe_reward": 1.2973501880963643, "train_probe_reward_std": 0.3245939314365387, "train_probe_rewards/accuracy_reward": 0.7439236144224802, "train_probe_rewards/brier_reward": 0.8577053844928741, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9930555522441864, "train_probe_rewards/mean_confidence_reward": 0.7861284911632538, "train_probe_runtime": 189.7581, "train_probe_samples_per_second": 5.27, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3713650157054265, "train_probe_signal/accuracy_reward/group_std_mean": 0.43572990596294403, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.18568250785271326, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.18568250785271326, "train_probe_signal/advantage_abs_mean": 0.26594458520412445, "train_probe_signal/advantage_pre_scale_abs_mean": 0.26594458520412445, "train_probe_signal/advantage_pre_scale_std": 0.3234584480524063, "train_probe_signal/advantage_std": 0.3234584480524063, "train_probe_signal/brier_reward/centered_abs_mean": 0.1733687644203504, "train_probe_signal/brier_reward/group_std_mean": 0.23685835053523382, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0866843822101752, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.0866843822101752, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.013346354011446238, "train_probe_signal/format_reward/group_std_mean": 0.0362943010404706, "train_probe_signal/format_reward/group_zero_std_frac": 0.8055555721124014, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.006673177005723119, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.006673177005723119, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.16220596184333166, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.21322756757338843, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.6220595663677766e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.6220595663677766e-06, "train_probe_steps_per_second": 0.032 }, { "calibration/aurc": 0.15334668529398848, "calibration/batch_distribution_entropy": 0.6159223058376259, "calibration/confidence_entropy": 0.3720421741526594, "calibration/coverage@0%": 0.06774193548387096, "calibration/coverage@1%": 0.06774193548387096, "calibration/coverage@10%": 0.27521280162560324, "calibration/coverage@15%": 0.48130669714672764, "calibration/coverage@20%": 0.7307567013800694, "calibration/coverage@25%": 0.8907340053763442, "calibration/coverage@30%": 0.9690451612903226, "calibration/coverage@5%": 0.10880860215053763, "calibration/ece": 0.13828358139657948, "calibration/mean_confidence": 0.8144861035369569, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00659722222222221, "completions/max_length": 3727.6, "completions/max_terminated_length": 3727.6, "completions/mean_length": 906.83671875, "completions/mean_terminated_length": 912.8620361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 266.4, "epoch": 0.9719878501518731, "grad_norm": 0.00045347586274147034, "learning_rate": 3.81610576923077e-06, "loss": -0.0068, "num_tokens": 958110347.0, "reward": 1.2766327857971191, "reward_std": 0.12398091852664947, "rewards/accuracy_reward": 0.7133680582046509, "rewards/brier_reward": 0.8464791417121887, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934027791023254, "rewards/mean_confidence_reward": 0.7757734537124634, "signal/accuracy_reward/centered_abs_mean": 0.11477864533662796, "signal/accuracy_reward/group_std_mean": 0.15435773730278016, "signal/accuracy_reward/group_zero_std_frac": 0.5527777791023254, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05738932266831398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05738932266831398, "signal/advantage_abs_mean": 0.08912380784749985, "signal/advantage_pre_scale_abs_mean": 0.08912380784749985, "signal/advantage_pre_scale_std": 0.18138374090194703, "signal/advantage_std": 0.18138374090194703, "signal/brier_reward/centered_abs_mean": 0.07356481403112411, "signal/brier_reward/group_std_mean": 0.10099510848522186, "signal/brier_reward/group_zero_std_frac": 0.10000000223517418, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036782407015562055, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036782407015562055, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01123046870343387, "signal/format_reward/group_std_mean": 0.02265222743153572, "signal/format_reward/group_zero_std_frac": 0.9, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005615234351716935, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005615234351716935, "signal/mean_confidence_reward/centered_abs_mean": 0.061173788458108905, "signal/mean_confidence_reward/group_std_mean": 0.08097953125834464, "signal/mean_confidence_reward/group_zero_std_frac": 0.11388888880610466, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.117378291037312e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.117378291037312e-07, "step": 405 }, { "calibration/aurc": 0.15765953734811217, "calibration/batch_distribution_entropy": 0.8336808470933745, "calibration/confidence_entropy": 0.4969874366544989, "calibration/coverage@0%": 0.05990882718607542, "calibration/coverage@1%": 0.05990882718607542, "calibration/coverage@10%": 0.3835232387115123, "calibration/coverage@15%": 0.4795281504584069, "calibration/coverage@20%": 0.6502217714993525, "calibration/coverage@25%": 0.7406914928006371, "calibration/coverage@30%": 0.8544621331924516, "calibration/coverage@5%": 0.16029842692360824, "calibration/ece": 0.10699348671000608, "calibration/mean_confidence": 0.6873180675902613, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007638888888888906, "completions/max_length": 3861.4, "completions/max_terminated_length": 3861.4, "completions/mean_length": 920.8962646484375, "completions/mean_terminated_length": 928.0929077148437, "completions/min_length": 0.0, "completions/min_terminated_length": 255.8, "epoch": 0.983987700153748, "grad_norm": 0.00035713831312023103, "learning_rate": 3.7860576923076927e-06, "loss": -0.0063, "num_tokens": 971815840.0, "reward": 1.2550601482391357, "reward_std": 0.11442221105098724, "rewards/accuracy_reward": 0.6866319417953491, "rewards/brier_reward": 0.8310266256332397, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9924479126930237, "rewards/mean_confidence_reward": 0.6924869656562805, "signal/accuracy_reward/centered_abs_mean": 0.12317708432674408, "signal/accuracy_reward/group_std_mean": 0.16123688519001006, "signal/accuracy_reward/group_zero_std_frac": 0.544444465637207, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06158854216337204, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06158854216337204, "signal/advantage_abs_mean": 0.08303250819444656, "signal/advantage_pre_scale_abs_mean": 0.08303250819444656, "signal/advantage_pre_scale_std": 0.16405427753925322, "signal/advantage_std": 0.16405427753925322, "signal/brier_reward/centered_abs_mean": 0.06928967908024788, "signal/brier_reward/group_std_mean": 0.09232426434755325, "signal/brier_reward/group_zero_std_frac": 0.08055555745959282, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03464483954012394, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03464483954012394, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.00990125872194767, "signal/format_reward/group_std_mean": 0.02039713803678751, "signal/format_reward/group_zero_std_frac": 0.9111111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004950629360973835, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004950629360973835, "signal/mean_confidence_reward/centered_abs_mean": 0.06385140419006348, "signal/mean_confidence_reward/group_std_mean": 0.08166046142578125, "signal/mean_confidence_reward/group_zero_std_frac": 0.08055555745959282, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.385140636666619e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.385140636666619e-07, "step": 410 }, { "calibration/aurc": 0.12707729646882523, "calibration/batch_distribution_entropy": 0.8658722768822102, "calibration/confidence_entropy": 0.5143584605937814, "calibration/coverage@0%": 0.22675303321451118, "calibration/coverage@1%": 0.22675303321451118, "calibration/coverage@10%": 0.4874988976458357, "calibration/coverage@15%": 0.5965369834020274, "calibration/coverage@20%": 0.8295668212283243, "calibration/coverage@25%": 0.9200797872340425, "calibration/coverage@30%": 0.9654255319148936, "calibration/coverage@5%": 0.26063719551817605, "calibration/ece": 0.15707801128346519, "calibration/mean_confidence": 0.6493427213295087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005989583333333326, "completions/max_length": 3493.8, "completions/max_terminated_length": 3493.8, "completions/mean_length": 902.0040893554688, "completions/mean_terminated_length": 907.4773559570312, "completions/min_length": 0.0, "completions/min_terminated_length": 256.6, "epoch": 0.995987550155623, "grad_norm": 0.0004414472496137023, "learning_rate": 3.756009615384616e-06, "loss": -0.006, "num_tokens": 985349199.0, "reward": 1.2856523990631104, "reward_std": 0.11382195055484771, "rewards/accuracy_reward": 0.7251736044883728, "rewards/brier_reward": 0.852107048034668, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9940104246139526, "rewards/mean_confidence_reward": 0.6859262228012085, "signal/accuracy_reward/centered_abs_mean": 0.12038845419883729, "signal/accuracy_reward/group_std_mean": 0.16099080741405486, "signal/accuracy_reward/group_zero_std_frac": 0.5305555760860443, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06019422709941864, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06019422709941864, "signal/advantage_abs_mean": 0.08231369405984879, "signal/advantage_pre_scale_abs_mean": 0.08231369405984879, "signal/advantage_pre_scale_std": 0.15995562374591826, "signal/advantage_std": 0.15995562374591826, "signal/brier_reward/centered_abs_mean": 0.06809909045696258, "signal/brier_reward/group_std_mean": 0.09161102324724198, "signal/brier_reward/group_zero_std_frac": 0.05277777910232544, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03404954522848129, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03404954522848129, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.010020616371184588, "signal/format_reward/group_std_mean": 0.017517024464905263, "signal/format_reward/group_zero_std_frac": 0.9333333253860474, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005010308185592294, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005010308185592294, "signal/mean_confidence_reward/centered_abs_mean": 0.06621891856193543, "signal/mean_confidence_reward/group_std_mean": 0.08555808812379836, "signal/mean_confidence_reward/group_zero_std_frac": 0.05555555671453476, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.621891770919319e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.621891770919319e-07, "step": 415 }, { "calibration/aurc": 0.18565116283078092, "calibration/batch_distribution_entropy": 0.7579450784089301, "calibration/confidence_entropy": 0.4334492655445832, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.2763076366117366, "calibration/coverage@15%": 0.44605441119152767, "calibration/coverage@20%": 0.686447938397687, "calibration/coverage@25%": 0.7367602059513338, "calibration/coverage@30%": 0.7675803117714397, "calibration/coverage@5%": 0.15505057888984036, "calibration/ece": 0.14551149173415018, "calibration/mean_confidence": 0.7268827879992423, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004513888888888906, "completions/max_length": 2889.2, "completions/max_terminated_length": 2889.2, "completions/mean_length": 817.30078125, "completions/mean_terminated_length": 821.3833862304688, "completions/min_length": 82.6, "completions/min_terminated_length": 258.0, "epoch": 1.0095998800015, "grad_norm": 0.0005732954014092684, "learning_rate": 3.725961538461539e-06, "loss": -0.0058, "num_tokens": 998670725.0, "reward": 1.2844402074813843, "reward_std": 0.1270935669541359, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.8596643567085266, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9943576455116272, "rewards/mean_confidence_reward": 0.7316796779632568, "signal/accuracy_reward/centered_abs_mean": 0.12907443642616273, "signal/accuracy_reward/group_std_mean": 0.1728029191493988, "signal/accuracy_reward/group_zero_std_frac": 0.5027777910232544, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06453721821308137, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06453721821308137, "signal/advantage_abs_mean": 0.09220675975084305, "signal/advantage_pre_scale_abs_mean": 0.09220675975084305, "signal/advantage_pre_scale_std": 0.17704798877239228, "signal/advantage_std": 0.17704798877239228, "signal/brier_reward/centered_abs_mean": 0.07588545978069305, "signal/brier_reward/group_std_mean": 0.10389434546232224, "signal/brier_reward/group_zero_std_frac": 0.11944444552063942, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03794272989034653, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03794272989034653, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.009488932136446238, "signal/format_reward/group_std_mean": 0.01661727111786604, "signal/format_reward/group_zero_std_frac": 0.9361111283302307, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004744466068223119, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004744466068223119, "signal/mean_confidence_reward/centered_abs_mean": 0.06570395305752755, "signal/mean_confidence_reward/group_std_mean": 0.08515858501195908, "signal/mean_confidence_reward/group_zero_std_frac": 0.13055555745959282, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.570395157723397e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.570395157723397e-07, "step": 420 }, { "calibration/aurc": 0.18756609504410343, "calibration/batch_distribution_entropy": 0.630013250162695, "calibration/confidence_entropy": 0.3702480144932891, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.09842931937172775, "calibration/coverage@15%": 0.4269718019652874, "calibration/coverage@20%": 0.6641688752976519, "calibration/coverage@25%": 0.7901085216492902, "calibration/coverage@30%": 0.9100711601117917, "calibration/coverage@5%": 0.0, "calibration/ece": 0.1421922903459847, "calibration/mean_confidence": 0.8093211539513074, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 3906.4, "completions/max_terminated_length": 3906.4, "completions/mean_length": 931.5106689453125, "completions/mean_terminated_length": 937.3971801757813, "completions/min_length": 0.0, "completions/min_terminated_length": 274.6, "epoch": 1.021599730003375, "grad_norm": 0.000559783133212477, "learning_rate": 3.695913461538462e-06, "loss": -0.0064, "num_tokens": 1012522944.0, "reward": 1.2773274183273315, "reward_std": 0.14319739788770675, "rewards/accuracy_reward": 0.7174479246139527, "rewards/brier_reward": 0.8434412240982055, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99375, "rewards/mean_confidence_reward": 0.7824678897857666, "signal/accuracy_reward/centered_abs_mean": 0.1313747838139534, "signal/accuracy_reward/group_std_mean": 0.17641664743423463, "signal/accuracy_reward/group_zero_std_frac": 0.48055556416511536, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0656873919069767, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0656873919069767, "signal/advantage_abs_mean": 0.10497878938913345, "signal/advantage_pre_scale_abs_mean": 0.10497878938913345, "signal/advantage_pre_scale_std": 0.20087648928165436, "signal/advantage_std": 0.20087648928165436, "signal/brier_reward/centered_abs_mean": 0.09107427299022675, "signal/brier_reward/group_std_mean": 0.12208644300699234, "signal/brier_reward/group_zero_std_frac": 0.17222222238779067, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04553713649511337, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04553713649511337, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.010828992910683155, "signal/format_reward/group_std_mean": 0.020968519151210785, "signal/format_reward/group_zero_std_frac": 0.9111111283302307, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005414496455341577, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005414496455341577, "signal/mean_confidence_reward/centered_abs_mean": 0.06407147422432899, "signal/mean_confidence_reward/group_std_mean": 0.08539035767316819, "signal/mean_confidence_reward/group_zero_std_frac": 0.19444444626569748, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.407146997844392e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.407146997844392e-07, "step": 425 }, { "calibration/aurc": 0.08335348568650214, "calibration/batch_distribution_entropy": 0.7495570490127685, "calibration/confidence_entropy": 0.4137658959601561, "calibration/coverage@0%": 0.28505461156753653, "calibration/coverage@1%": 0.36774294970814597, "calibration/coverage@10%": 0.6288286550676891, "calibration/coverage@15%": 0.7269702427332668, "calibration/coverage@20%": 0.8, "calibration/coverage@25%": 0.9700787401574804, "calibration/coverage@30%": 0.9853018372703412, "calibration/coverage@5%": 0.5576832474972546, "calibration/ece": 0.11785287988791912, "calibration/mean_confidence": 0.7586088433917471, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002256944444444442, "completions/max_length": 3706.6, "completions/max_terminated_length": 3706.6, "completions/mean_length": 906.5394165039063, "completions/mean_terminated_length": 908.6003784179687, "completions/min_length": 0.0, "completions/min_terminated_length": 266.6, "epoch": 1.03359958000525, "grad_norm": 0.00045757388579659164, "learning_rate": 3.665865384615385e-06, "loss": -0.0003, "num_tokens": 1026042758.0, "reward": 1.3354929208755493, "reward_std": 0.10449671745300293, "rewards/accuracy_reward": 0.7881944417953491, "rewards/brier_reward": 0.8850325345993042, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9977430462837219, "rewards/mean_confidence_reward": 0.7831640601158142, "signal/accuracy_reward/centered_abs_mean": 0.10152994841337204, "signal/accuracy_reward/group_std_mean": 0.13918909877538682, "signal/accuracy_reward/group_zero_std_frac": 0.5861111164093018, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05076497420668602, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05076497420668602, "signal/advantage_abs_mean": 0.0735987514257431, "signal/advantage_pre_scale_abs_mean": 0.0735987514257431, "signal/advantage_pre_scale_std": 0.1566575586795807, "signal/advantage_std": 0.1566575586795807, "signal/brier_reward/centered_abs_mean": 0.06317389458417892, "signal/brier_reward/group_std_mean": 0.08807524144649506, "signal/brier_reward/group_zero_std_frac": 0.16666666567325591, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03158694729208946, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03158694729208946, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.004351128474809229, "signal/format_reward/group_std_mean": 0.012169323675334454, "signal/format_reward/group_zero_std_frac": 0.9333333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0021755642374046147, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0021755642374046147, "signal/mean_confidence_reward/centered_abs_mean": 0.058977609127759935, "signal/mean_confidence_reward/group_std_mean": 0.07780204266309738, "signal/mean_confidence_reward/group_zero_std_frac": 0.16944444328546523, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.897760729567381e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.897760729567381e-07, "step": 430 }, { "calibration/aurc": 0.17303748565903715, "calibration/batch_distribution_entropy": 0.774028078201775, "calibration/confidence_entropy": 0.44050268439380547, "calibration/coverage@0%": 0.03439153439153439, "calibration/coverage@1%": 0.03439153439153439, "calibration/coverage@10%": 0.36305839754115615, "calibration/coverage@15%": 0.4388727492175768, "calibration/coverage@20%": 0.601967987312815, "calibration/coverage@25%": 0.6556828966850519, "calibration/coverage@30%": 0.7963167898564778, "calibration/coverage@5%": 0.2948044292871879, "calibration/ece": 0.13477044230209043, "calibration/mean_confidence": 0.7287672255361086, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0059895833333333485, "completions/max_length": 3817.0, "completions/max_terminated_length": 3817.0, "completions/mean_length": 955.967626953125, "completions/mean_terminated_length": 961.725537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 283.8, "epoch": 1.045599430007125, "grad_norm": 0.0004141830140724778, "learning_rate": 3.635817307692308e-06, "loss": -0.0056, "num_tokens": 1040140017.0, "reward": 1.2715159416198731, "reward_std": 0.11028102338314057, "rewards/accuracy_reward": 0.7049479007720947, "rewards/brier_reward": 0.8440587878227234, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9940104126930237, "rewards/mean_confidence_reward": 0.7376831650733948, "signal/accuracy_reward/centered_abs_mean": 0.10957573652267456, "signal/accuracy_reward/group_std_mean": 0.14774862676858902, "signal/accuracy_reward/group_zero_std_frac": 0.5638888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05478786826133728, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05478786826133728, "signal/advantage_abs_mean": 0.07771620154380798, "signal/advantage_pre_scale_abs_mean": 0.07771620154380798, "signal/advantage_pre_scale_std": 0.1612031042575836, "signal/advantage_std": 0.1612031042575836, "signal/brier_reward/centered_abs_mean": 0.07119961306452752, "signal/brier_reward/group_std_mean": 0.09745723158121108, "signal/brier_reward/group_zero_std_frac": 0.09722222462296486, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03559980653226376, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03559980653226376, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.010465494683012366, "signal/format_reward/group_std_mean": 0.020273290947079657, "signal/format_reward/group_zero_std_frac": 0.9138889074325561, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005232747341506183, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005232747341506183, "signal/mean_confidence_reward/centered_abs_mean": 0.06722435206174851, "signal/mean_confidence_reward/group_std_mean": 0.08768622428178788, "signal/mean_confidence_reward/group_zero_std_frac": 0.10000000223517418, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.722435159645102e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.722435159645102e-07, "step": 435 }, { "calibration/aurc": 0.13172015584112373, "calibration/batch_distribution_entropy": 0.7508988704899572, "calibration/confidence_entropy": 0.45066248515434515, "calibration/coverage@0%": 0.31783667123106774, "calibration/coverage@1%": 0.42632473489154527, "calibration/coverage@10%": 0.4901298985105803, "calibration/coverage@15%": 0.752693544934042, "calibration/coverage@20%": 0.8014608502503113, "calibration/coverage@25%": 0.8246919746919747, "calibration/coverage@30%": 0.8516483516483516, "calibration/coverage@5%": 0.46976574561801837, "calibration/ece": 0.12898603365099034, "calibration/mean_confidence": 0.7108380002759374, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011545138888888907, "completions/max_length": 3665.2, "completions/max_terminated_length": 3665.2, "completions/mean_length": 998.803466796875, "completions/mean_terminated_length": 1010.4798828125, "completions/min_length": 0.0, "completions/min_terminated_length": 302.8, "epoch": 1.057599280009, "grad_norm": 0.0003651871520560235, "learning_rate": 3.605769230769231e-06, "loss": -0.011, "num_tokens": 1054746361.0, "reward": 1.2987138032913208, "reward_std": 0.12152182906866074, "rewards/accuracy_reward": 0.7493923664093017, "rewards/brier_reward": 0.8596528172492981, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9883680462837219, "rewards/mean_confidence_reward": 0.716652762889862, "signal/accuracy_reward/centered_abs_mean": 0.11401367336511611, "signal/accuracy_reward/group_std_mean": 0.1540904313325882, "signal/accuracy_reward/group_zero_std_frac": 0.550000011920929, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05700683668255806, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05700683668255806, "signal/advantage_abs_mean": 0.08622867465019227, "signal/advantage_pre_scale_abs_mean": 0.08622867465019227, "signal/advantage_pre_scale_std": 0.1765172302722931, "signal/advantage_std": 0.1765172302722931, "signal/brier_reward/centered_abs_mean": 0.07706657350063324, "signal/brier_reward/group_std_mean": 0.10269050449132919, "signal/brier_reward/group_zero_std_frac": 0.08055555745959282, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03853328675031662, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03853328675031662, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.018760850839316844, "signal/format_reward/group_std_mean": 0.03368374258279801, "signal/format_reward/group_zero_std_frac": 0.8666666626930237, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.009380425419658422, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009380425419658422, "signal/mean_confidence_reward/centered_abs_mean": 0.0717637374997139, "signal/mean_confidence_reward/group_std_mean": 0.09581109434366226, "signal/mean_confidence_reward/group_zero_std_frac": 0.08333333656191826, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.176373401307501e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.176373401307501e-07, "step": 440 }, { "calibration/aurc": 0.11535288639000432, "calibration/batch_distribution_entropy": 0.7676501572988863, "calibration/confidence_entropy": 0.4211633931973693, "calibration/coverage@0%": 0.12859081969029612, "calibration/coverage@1%": 0.12859081969029612, "calibration/coverage@10%": 0.5449202620231814, "calibration/coverage@15%": 0.6303841577230134, "calibration/coverage@20%": 0.6745000873853833, "calibration/coverage@25%": 0.8864359587951618, "calibration/coverage@30%": 0.9434588376540883, "calibration/coverage@5%": 0.43218471787757873, "calibration/ece": 0.1018693186070927, "calibration/mean_confidence": 0.726212957166929, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010503472222222232, "completions/max_length": 3909.2, "completions/max_terminated_length": 3909.2, "completions/mean_length": 1011.0391723632813, "completions/mean_terminated_length": 1021.8754516601563, "completions/min_length": 0.0, "completions/min_terminated_length": 302.6, "epoch": 1.069599130010875, "grad_norm": 0.000393132446333766, "learning_rate": 3.575721153846154e-06, "loss": -0.0112, "num_tokens": 1069450556.0, "reward": 1.2810823678970338, "reward_std": 0.12434810400009155, "rewards/accuracy_reward": 0.7106770873069763, "rewards/brier_reward": 0.8621500372886658, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9893229126930236, "rewards/mean_confidence_reward": 0.7332525968551635, "signal/accuracy_reward/centered_abs_mean": 0.10539822280406952, "signal/accuracy_reward/group_std_mean": 0.14428263306617736, "signal/accuracy_reward/group_zero_std_frac": 0.5611111223697662, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05269911140203476, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05269911140203476, "signal/advantage_abs_mean": 0.08532515764236451, "signal/advantage_pre_scale_abs_mean": 0.08532515764236451, "signal/advantage_pre_scale_std": 0.18277325630187988, "signal/advantage_std": 0.18277325630187988, "signal/brier_reward/centered_abs_mean": 0.0792836882174015, "signal/brier_reward/group_std_mean": 0.11051161736249923, "signal/brier_reward/group_zero_std_frac": 0.1194444477558136, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03964184410870075, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03964184410870075, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.019287109375, "signal/format_reward/group_std_mean": 0.03896704204380512, "signal/format_reward/group_zero_std_frac": 0.8333333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0096435546875, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0096435546875, "signal/mean_confidence_reward/centered_abs_mean": 0.07209224328398704, "signal/mean_confidence_reward/group_std_mean": 0.09649368673563004, "signal/mean_confidence_reward/group_zero_std_frac": 0.12777777761220932, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.209224349935539e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.209224349935539e-07, "step": 445 }, { "calibration/aurc": 0.08796893010234426, "calibration/batch_distribution_entropy": 0.7076291446211235, "calibration/confidence_entropy": 0.384656312463077, "calibration/coverage@0%": 0.20570850202429153, "calibration/coverage@1%": 0.2131355577272093, "calibration/coverage@10%": 0.6097780701650695, "calibration/coverage@15%": 0.8129811542544545, "calibration/coverage@20%": 0.9011322047034515, "calibration/coverage@25%": 0.9741837071240106, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.3990812593677123, "calibration/ece": 0.08757120799564053, "calibration/mean_confidence": 0.7574029549068722, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01362847222222221, "completions/max_length": 3803.6, "completions/max_terminated_length": 3803.6, "completions/mean_length": 1083.1001342773438, "completions/mean_terminated_length": 1098.1322021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 320.8, "epoch": 1.08159898001275, "grad_norm": 0.0003348525206092745, "learning_rate": 3.5456730769230774e-06, "loss": -0.0149, "num_tokens": 1085031133.0, "reward": 1.2805572032928467, "reward_std": 0.1424509435892105, "rewards/accuracy_reward": 0.7157986044883728, "rewards/brier_reward": 0.8589296817779541, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9863715291023254, "rewards/mean_confidence_reward": 0.7274316906929016, "signal/accuracy_reward/centered_abs_mean": 0.11979166567325591, "signal/accuracy_reward/group_std_mean": 0.16534026563167573, "signal/accuracy_reward/group_zero_std_frac": 0.5055555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05989583283662796, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05989583283662796, "signal/advantage_abs_mean": 0.09766105860471726, "signal/advantage_pre_scale_abs_mean": 0.09766105860471726, "signal/advantage_pre_scale_std": 0.19776962101459503, "signal/advantage_std": 0.19776962101459503, "signal/brier_reward/centered_abs_mean": 0.09170636087656021, "signal/brier_reward/group_std_mean": 0.1260788545012474, "signal/brier_reward/group_zero_std_frac": 0.1361111134290695, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04585318043828011, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04585318043828011, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02412651889026165, "signal/format_reward/group_std_mean": 0.048982583731412885, "signal/format_reward/group_zero_std_frac": 0.7861111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.012063259445130824, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012063259445130824, "signal/mean_confidence_reward/centered_abs_mean": 0.0799168199300766, "signal/mean_confidence_reward/group_std_mean": 0.10652744621038437, "signal/mean_confidence_reward/group_zero_std_frac": 0.14166666865348815, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.991681627572689e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.991681627572689e-07, "step": 450 }, { "epoch": 1.08159898001275, "eval_calibration/aurc": 0.08213034082503025, "eval_calibration/batch_distribution_entropy": 0.6620134820916984, "eval_calibration/confidence_entropy": 0.3696003832513866, "eval_calibration/coverage@0%": 0.32106854838709675, "eval_calibration/coverage@1%": 0.32106854838709675, "eval_calibration/coverage@10%": 0.7182123655913979, "eval_calibration/coverage@15%": 0.8088037634408604, "eval_calibration/coverage@20%": 0.8674395161290324, "eval_calibration/coverage@25%": 0.9415322580645161, "eval_calibration/coverage@30%": 0.9630376344086021, "eval_calibration/coverage@5%": 0.48863127240143367, "eval_calibration/ece": 0.15153561827956993, "eval_calibration/mean_confidence": 0.7640591397849462, "eval_completions/clipped_ratio": 0.0248263888888889, "eval_completions/max_length": 3233.6666666666665, "eval_completions/max_terminated_length": 3233.6666666666665, "eval_completions/mean_length": 1038.7929077148438, "eval_completions/mean_terminated_length": 1064.9809265136719, "eval_completions/min_length": 0.0, "eval_completions/min_terminated_length": 290.1666666666667, "eval_loss": 0.0, "eval_num_tokens": 1085031133.0, "eval_reward": 1.265126625696818, "eval_reward_std": 0.35113013287385303, "eval_rewards/accuracy_reward": 0.7048611044883728, "eval_rewards/brier_reward": 0.8453425268332163, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9800347288449606, "eval_rewards/mean_confidence_reward": 0.7380034824212393, "eval_runtime": 216.6122, "eval_samples_per_second": 4.617, "eval_signal/accuracy_reward/centered_abs_mean": 0.4010416666666667, "eval_signal/accuracy_reward/group_std_mean": 0.4536542743444443, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.20052083333333334, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.20052083333333334, "eval_signal/advantage_abs_mean": 0.2878503352403641, "eval_signal/advantage_pre_scale_abs_mean": 0.2878503352403641, "eval_signal/advantage_pre_scale_std": 0.34932101766268414, "eval_signal/advantage_std": 0.34932101766268414, "eval_signal/brier_reward/centered_abs_mean": 0.1969408467411995, "eval_signal/brier_reward/group_std_mean": 0.2647774467865626, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09847042337059975, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.09847042337059975, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.038357205068071686, "eval_signal/format_reward/group_std_mean": 0.10397243872284889, "eval_signal/format_reward/group_zero_std_frac": 0.4444444477558136, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.019178602534035843, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.019178602534035843, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.2266856630643209, "eval_signal/mean_confidence_reward/group_std_mean": 0.2828233440717061, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.266856540700246e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.266856540700246e-06, "eval_steps_per_second": 0.028, "step": 450 }, { "epoch": 1.08159898001275, "step": 450, "train_probe_calibration/aurc": 0.14803249540243577, "train_probe_calibration/batch_distribution_entropy": 0.6559592419490077, "train_probe_calibration/confidence_entropy": 0.3785335538919464, "train_probe_calibration/coverage@0%": 0.06989247311827958, "train_probe_calibration/coverage@1%": 0.06989247311827958, "train_probe_calibration/coverage@10%": 0.27592965949820786, "train_probe_calibration/coverage@15%": 0.6208893369175628, "train_probe_calibration/coverage@20%": 0.8314404121863799, "train_probe_calibration/coverage@25%": 0.9102598566308244, "train_probe_calibration/coverage@30%": 0.9479166666666666, "train_probe_calibration/coverage@5%": 0.10752688172043011, "train_probe_calibration/ece": 0.13306451612903217, "train_probe_calibration/mean_confidence": 0.7622479838709677, "train_probe_completions/clipped_ratio": 0.01545138888888888, "train_probe_completions/max_length": 2931.1666666666665, "train_probe_completions/max_terminated_length": 2931.1666666666665, "train_probe_completions/mean_length": 1054.8274637858074, "train_probe_completions/mean_terminated_length": 1071.353983561198, "train_probe_completions/min_length": 0.0, "train_probe_completions/min_terminated_length": 315.1666666666667, "train_probe_loss": 0.0, "train_probe_num_tokens": 1085031133.0, "train_probe_reward": 1.295697530110677, "train_probe_reward_std": 0.3183877070744832, "train_probe_rewards/accuracy_reward": 0.7447916766007742, "train_probe_rewards/brier_reward": 0.8596091171105703, "train_probe_rewards/confidence_one_or_zero": 0.0008680555814256271, "train_probe_rewards/format_reward": 0.9869791567325592, "train_probe_rewards/mean_confidence_reward": 0.7478385667006174, "train_probe_runtime": 213.481, "train_probe_samples_per_second": 4.684, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3653428852558136, "train_probe_signal/accuracy_reward/group_std_mean": 0.4310339738925298, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1826714426279068, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.1826714426279068, "train_probe_signal/advantage_abs_mean": 0.2562904159228007, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2562904159228007, "train_probe_signal/advantage_pre_scale_std": 0.32024938861529034, "train_probe_signal/advantage_std": 0.32024938861529034, "train_probe_signal/brier_reward/centered_abs_mean": 0.17699170857667923, "train_probe_signal/brier_reward/group_std_mean": 0.24220533668994904, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08849585428833961, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.08849585428833961, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0016818575871487458, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0049104637776811915, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 0.9722222288449606, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151002e-08, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151002e-08, "train_probe_signal/format_reward/centered_abs_mean": 0.024685330068071682, "train_probe_signal/format_reward/group_std_mean": 0.058709911381204925, "train_probe_signal/format_reward/group_zero_std_frac": 0.7222222487131754, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.012342665034035841, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.012342665034035841, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.21301595866680145, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.266616885860761, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.1301596007106127e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.1301596007106127e-06, "train_probe_steps_per_second": 0.028 }, { "calibration/aurc": 0.14486493127178762, "calibration/batch_distribution_entropy": 0.6849579848554829, "calibration/confidence_entropy": 0.38067323640480727, "calibration/coverage@0%": 0.08157894736842106, "calibration/coverage@1%": 0.08157894736842106, "calibration/coverage@10%": 0.36475903489512496, "calibration/coverage@15%": 0.6107892810899059, "calibration/coverage@20%": 0.6891038148865499, "calibration/coverage@25%": 0.7929873655614527, "calibration/coverage@30%": 0.9131920463406484, "calibration/coverage@5%": 0.3205485085793355, "calibration/ece": 0.11820604838788473, "calibration/mean_confidence": 0.7483901553987021, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017100694444444443, "completions/max_length": 3880.6, "completions/max_terminated_length": 3880.6, "completions/mean_length": 1100.1171142578125, "completions/mean_terminated_length": 1119.1901611328126, "completions/min_length": 0.0, "completions/min_terminated_length": 309.2, "epoch": 1.0935988300146249, "grad_norm": 0.000438571791164577, "learning_rate": 3.5156250000000003e-06, "loss": -0.0177, "num_tokens": 1100816354.0, "reward": 1.2930595636367799, "reward_std": 0.14985843598842621, "rewards/accuracy_reward": 0.7456597208976745, "rewards/brier_reward": 0.8578055739402771, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9826388835906983, "rewards/mean_confidence_reward": 0.7443784594535827, "signal/accuracy_reward/centered_abs_mean": 0.12549913078546523, "signal/accuracy_reward/group_std_mean": 0.16759584248065948, "signal/accuracy_reward/group_zero_std_frac": 0.5138889014720917, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06274956539273262, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06274956539273262, "signal/advantage_abs_mean": 0.10355184078216553, "signal/advantage_pre_scale_abs_mean": 0.10355184078216553, "signal/advantage_pre_scale_std": 0.20846091508865355, "signal/advantage_std": 0.20846091508865355, "signal/brier_reward/centered_abs_mean": 0.09572955071926117, "signal/brier_reward/group_std_mean": 0.13273234963417052, "signal/brier_reward/group_zero_std_frac": 0.15, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04786477535963059, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04786477535963059, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02801649309694767, "signal/format_reward/group_std_mean": 0.055632898956537245, "signal/format_reward/group_zero_std_frac": 0.7611111164093017, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.014008246548473834, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.014008246548473834, "signal/mean_confidence_reward/centered_abs_mean": 0.08189736753702163, "signal/mean_confidence_reward/group_std_mean": 0.11145165711641311, "signal/mean_confidence_reward/group_zero_std_frac": 0.15, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.189736718122731e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.189736718122731e-07, "step": 455 }, { "calibration/aurc": 0.09784459316803407, "calibration/batch_distribution_entropy": 0.5565789003954587, "calibration/confidence_entropy": 0.33102373243444044, "calibration/coverage@0%": 0.005295788456824833, "calibration/coverage@1%": 0.005295788456824833, "calibration/coverage@10%": 0.678626690631282, "calibration/coverage@15%": 0.7302573546951785, "calibration/coverage@20%": 0.8549957794659914, "calibration/coverage@25%": 0.9650273224043715, "calibration/coverage@30%": 0.9857923497267759, "calibration/coverage@5%": 0.26503222223277206, "calibration/ece": 0.08287913990192171, "calibration/mean_confidence": 0.8266985641554336, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.019704861111111117, "completions/max_length": 3719.0, "completions/max_terminated_length": 3719.0, "completions/mean_length": 1068.5400390625, "completions/mean_terminated_length": 1089.921142578125, "completions/min_length": 0.0, "completions/min_terminated_length": 280.8, "epoch": 1.1055986800164999, "grad_norm": 0.0005805668188259006, "learning_rate": 3.4855769230769233e-06, "loss": -0.0215, "num_tokens": 1116195727.0, "reward": 1.2770362615585327, "reward_std": 0.15621853172779082, "rewards/accuracy_reward": 0.7355034828186036, "rewards/brier_reward": 0.8384317398071289, "rewards/confidence_one_or_zero": 0.0006944444554392249, "rewards/format_reward": 0.9801215410232544, "rewards/mean_confidence_reward": 0.7882052898406983, "signal/accuracy_reward/centered_abs_mean": 0.12322591245174408, "signal/accuracy_reward/group_std_mean": 0.16673467457294464, "signal/accuracy_reward/group_zero_std_frac": 0.5111111223697662, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06161295622587204, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06161295622587204, "signal/advantage_abs_mean": 0.11062140017747879, "signal/advantage_pre_scale_abs_mean": 0.11062140017747879, "signal/advantage_pre_scale_std": 0.22092588543891906, "signal/advantage_std": 0.22092588543891906, "signal/brier_reward/centered_abs_mean": 0.10055426955223083, "signal/brier_reward/group_std_mean": 0.13565018475055696, "signal/brier_reward/group_zero_std_frac": 0.2083333373069763, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.050277134776115416, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.050277134776115416, "signal/confidence_one_or_zero/centered_abs_mean": 0.0012803819379769265, "signal/confidence_one_or_zero/group_std_mean": 0.002897548582404852, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9861111044883728, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.2803818094653252e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.2803818094653252e-08, "signal/format_reward/centered_abs_mean": 0.0309950090944767, "signal/format_reward/group_std_mean": 0.05688217356801033, "signal/format_reward/group_zero_std_frac": 0.7666666746139527, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01549750454723835, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01549750454723835, "signal/mean_confidence_reward/centered_abs_mean": 0.07897297590970993, "signal/mean_confidence_reward/group_std_mean": 0.10757938027381897, "signal/mean_confidence_reward/group_zero_std_frac": 0.22222222089767457, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.897297678027826e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.897297678027826e-07, "step": 460 }, { "calibration/aurc": 0.10524791684697074, "calibration/batch_distribution_entropy": 0.5034653825337192, "calibration/confidence_entropy": 0.3041488369323679, "calibration/coverage@0%": 0.01396996261044246, "calibration/coverage@1%": 0.01396996261044246, "calibration/coverage@10%": 0.5778577830588459, "calibration/coverage@15%": 0.8322955129195574, "calibration/coverage@20%": 0.8861281819464949, "calibration/coverage@25%": 0.9698630136986301, "calibration/coverage@30%": 0.993972602739726, "calibration/coverage@5%": 0.15018617882665866, "calibration/ece": 0.1065183849970194, "calibration/mean_confidence": 0.8414365062842577, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02213541666666665, "completions/max_length": 4036.8, "completions/max_terminated_length": 4036.8, "completions/mean_length": 1073.0745849609375, "completions/mean_terminated_length": 1097.4721923828124, "completions/min_length": 0.0, "completions/min_terminated_length": 297.6, "epoch": 1.1175985300183748, "grad_norm": 0.00052850809879601, "learning_rate": 3.4555288461538466e-06, "loss": -0.0257, "num_tokens": 1131642250.0, "reward": 1.258783483505249, "reward_std": 0.16843646466732026, "rewards/accuracy_reward": 0.7051215171813965, "rewards/brier_reward": 0.8345649719238282, "rewards/confidence_one_or_zero": 0.00034722223062999547, "rewards/format_reward": 0.9778645992279053, "rewards/mean_confidence_reward": 0.7923576474189759, "signal/accuracy_reward/centered_abs_mean": 0.12906358540058135, "signal/accuracy_reward/group_std_mean": 0.17472618222236633, "signal/accuracy_reward/group_zero_std_frac": 0.48611111640930177, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06453179270029068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06453179270029068, "signal/advantage_abs_mean": 0.11907100975513459, "signal/advantage_pre_scale_abs_mean": 0.11907100975513459, "signal/advantage_pre_scale_std": 0.23408958911895753, "signal/advantage_std": 0.23408958911895753, "signal/brier_reward/centered_abs_mean": 0.1071409061551094, "signal/brier_reward/group_std_mean": 0.14336217045783997, "signal/brier_reward/group_zero_std_frac": 0.1444444477558136, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0535704530775547, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0535704530775547, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006618923391215504, "signal/confidence_one_or_zero/group_std_mean": 0.0016652446240186692, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.618923364953844e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.618923364953844e-09, "signal/format_reward/centered_abs_mean": 0.03630099818110466, "signal/format_reward/group_std_mean": 0.06572088748216628, "signal/format_reward/group_zero_std_frac": 0.7388888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01815049909055233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01815049909055233, "signal/mean_confidence_reward/centered_abs_mean": 0.08022120594978333, "signal/mean_confidence_reward/group_std_mean": 0.10990349501371384, "signal/mean_confidence_reward/group_zero_std_frac": 0.15277778208255768, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 8.0221208236253e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 8.0221208236253e-07, "step": 465 }, { "calibration/aurc": 0.07913108473440668, "calibration/batch_distribution_entropy": 0.6531839239628536, "calibration/confidence_entropy": 0.35972335462850424, "calibration/coverage@0%": 0.20957912413535382, "calibration/coverage@1%": 0.21011531716484444, "calibration/coverage@10%": 0.6533471002638437, "calibration/coverage@15%": 0.8473753137371496, "calibration/coverage@20%": 0.9476231551095099, "calibration/coverage@25%": 0.9884210526315791, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.40999648725214344, "calibration/ece": 0.06792838663137019, "calibration/mean_confidence": 0.7782400107527432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01987847222222221, "completions/max_length": 4026.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1074.6020874023438, "completions/mean_terminated_length": 1096.705126953125, "completions/min_length": 0.0, "completions/min_terminated_length": 311.4, "epoch": 1.1295983800202498, "grad_norm": 0.0005183502216823399, "learning_rate": 3.4254807692307695e-06, "loss": -0.0236, "num_tokens": 1147098754.0, "reward": 1.2714176654815674, "reward_std": 0.1467466503381729, "rewards/accuracy_reward": 0.7189236164093018, "rewards/brier_reward": 0.8438620448112488, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9800347208976745, "rewards/mean_confidence_reward": 0.7443752646446228, "signal/accuracy_reward/centered_abs_mean": 0.1118381068110466, "signal/accuracy_reward/group_std_mean": 0.15521054565906525, "signal/accuracy_reward/group_zero_std_frac": 0.5277777969837188, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0559190534055233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0559190534055233, "signal/advantage_abs_mean": 0.100922591984272, "signal/advantage_pre_scale_abs_mean": 0.100922591984272, "signal/advantage_pre_scale_std": 0.20572576820850372, "signal/advantage_std": 0.20572576820850372, "signal/brier_reward/centered_abs_mean": 0.09730305373668671, "signal/brier_reward/group_std_mean": 0.13238580524921417, "signal/brier_reward/group_zero_std_frac": 0.1583333358168602, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.048651526868343356, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.048651526868343356, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.03243272546678781, "signal/format_reward/group_std_mean": 0.05741032063961029, "signal/format_reward/group_zero_std_frac": 0.7777777791023255, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.016216362733393906, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.016216362733393906, "signal/mean_confidence_reward/centered_abs_mean": 0.07965410202741623, "signal/mean_confidence_reward/group_std_mean": 0.10579516440629959, "signal/mean_confidence_reward/group_zero_std_frac": 0.16944444477558135, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.9654097362436e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.9654097362436e-07, "step": 470 }, { "calibration/aurc": 0.11857188532983869, "calibration/batch_distribution_entropy": 0.7749914109094729, "calibration/confidence_entropy": 0.45146041634961237, "calibration/coverage@0%": 0.14216903336821493, "calibration/coverage@1%": 0.14216903336821493, "calibration/coverage@10%": 0.6626677812238765, "calibration/coverage@15%": 0.7538313027179352, "calibration/coverage@20%": 0.7762313027179351, "calibration/coverage@25%": 0.8533567599634546, "calibration/coverage@30%": 0.8810816943466566, "calibration/coverage@5%": 0.4054234090069261, "calibration/ece": 0.11782514448374282, "calibration/mean_confidence": 0.6272827531116163, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016145833333333325, "completions/max_length": 3930.0, "completions/max_terminated_length": 3930.0, "completions/mean_length": 1129.5532470703124, "completions/mean_terminated_length": 1148.1423583984374, "completions/min_length": 0.0, "completions/min_terminated_length": 323.4, "epoch": 1.1415982300221248, "grad_norm": 0.00029066819115541875, "learning_rate": 3.3954326923076925e-06, "loss": -0.0188, "num_tokens": 1163200935.0, "reward": 1.278049683570862, "reward_std": 0.12860337644815445, "rewards/accuracy_reward": 0.7177951335906982, "rewards/brier_reward": 0.8544366955757141, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9838541626930237, "rewards/mean_confidence_reward": 0.6632643342018127, "signal/accuracy_reward/centered_abs_mean": 0.11060655415058136, "signal/accuracy_reward/group_std_mean": 0.15503691434860228, "signal/accuracy_reward/group_zero_std_frac": 0.5305555641651154, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05530327707529068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05530327707529068, "signal/advantage_abs_mean": 0.08601635247468949, "signal/advantage_pre_scale_abs_mean": 0.08601635247468949, "signal/advantage_pre_scale_std": 0.18358878493309022, "signal/advantage_std": 0.18358878493309022, "signal/brier_reward/centered_abs_mean": 0.08666091859340667, "signal/brier_reward/group_std_mean": 0.11972427815198898, "signal/brier_reward/group_zero_std_frac": 0.08611111268401146, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04333045929670334, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04333045929670334, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.02725694477558136, "signal/format_reward/group_std_mean": 0.04947251603007317, "signal/format_reward/group_zero_std_frac": 0.8027777791023254, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.01362847238779068, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01362847238779068, "signal/mean_confidence_reward/centered_abs_mean": 0.07710379660129547, "signal/mean_confidence_reward/group_std_mean": 0.10281234383583068, "signal/mean_confidence_reward/group_zero_std_frac": 0.08888889029622078, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.710379804848344e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.710379804848344e-07, "step": 475 }, { "calibration/aurc": 0.14287097408671662, "calibration/batch_distribution_entropy": 0.7576410899883392, "calibration/confidence_entropy": 0.47943500603439365, "calibration/coverage@0%": 0.1401903600044483, "calibration/coverage@1%": 0.1401903600044483, "calibration/coverage@10%": 0.3280267828146931, "calibration/coverage@15%": 0.45132930204410526, "calibration/coverage@20%": 0.7416074713456912, "calibration/coverage@25%": 0.8366294042733833, "calibration/coverage@30%": 0.9403141361256544, "calibration/coverage@5%": 0.21154171135579963, "calibration/ece": 0.10323267006839787, "calibration/mean_confidence": 0.6980364545098185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0073784722222222324, "completions/max_length": 3827.6, "completions/max_terminated_length": 3827.6, "completions/mean_length": 999.9655395507813, "completions/mean_terminated_length": 1007.497607421875, "completions/min_length": 0.0, "completions/min_terminated_length": 330.8, "epoch": 1.1535980800239998, "grad_norm": 0.0004283481393940747, "learning_rate": 3.365384615384616e-06, "loss": -0.0068, "num_tokens": 1177813114.0, "reward": 1.3071688652038573, "reward_std": 0.11388030797243118, "rewards/accuracy_reward": 0.7558159828186035, "rewards/brier_reward": 0.8659730315208435, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9925347208976746, "rewards/mean_confidence_reward": 0.6916906833648682, "signal/accuracy_reward/centered_abs_mean": 0.10970594584941865, "signal/accuracy_reward/group_std_mean": 0.1493976280093193, "signal/accuracy_reward/group_zero_std_frac": 0.5527777969837189, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05485297292470932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05485297292470932, "signal/advantage_abs_mean": 0.07888792902231216, "signal/advantage_pre_scale_abs_mean": 0.07888792902231216, "signal/advantage_pre_scale_std": 0.164822319149971, "signal/advantage_std": 0.164822319149971, "signal/brier_reward/centered_abs_mean": 0.06824958994984627, "signal/brier_reward/group_std_mean": 0.09514854848384857, "signal/brier_reward/group_zero_std_frac": 0.08888889104127884, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.034124794974923135, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.034124794974923135, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.013368055783212185, "signal/format_reward/group_std_mean": 0.028724368289113046, "signal/format_reward/group_zero_std_frac": 0.8694444417953491, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006684027891606092, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006684027891606092, "signal/mean_confidence_reward/centered_abs_mean": 0.0640217125415802, "signal/mean_confidence_reward/group_std_mean": 0.08547939211130143, "signal/mean_confidence_reward/group_zero_std_frac": 0.09166666865348816, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.402171038644156e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.402171038644156e-07, "step": 480 }, { "calibration/aurc": 0.15693460480124644, "calibration/batch_distribution_entropy": 0.6853468226250661, "calibration/confidence_entropy": 0.4936850036471796, "calibration/coverage@0%": 0.03036495243292029, "calibration/coverage@1%": 0.03036495243292029, "calibration/coverage@10%": 0.20200531424805837, "calibration/coverage@15%": 0.5493835678939293, "calibration/coverage@20%": 0.7012183386852635, "calibration/coverage@25%": 0.8283185565652953, "calibration/coverage@30%": 0.8941080302495059, "calibration/coverage@5%": 0.20200531424805837, "calibration/ece": 0.11970637355968967, "calibration/mean_confidence": 0.7187880769979726, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010156249999999978, "completions/max_length": 3811.2, "completions/max_terminated_length": 3811.2, "completions/mean_length": 972.5010498046875, "completions/mean_terminated_length": 982.6981079101563, "completions/min_length": 0.0, "completions/min_terminated_length": 295.8, "epoch": 1.1655979300258748, "grad_norm": 0.00048771046567708254, "learning_rate": 3.3353365384615388e-06, "loss": -0.0109, "num_tokens": 1192131942.0, "reward": 1.2860528469085692, "reward_std": 0.12357107102870941, "rewards/accuracy_reward": 0.7283854126930237, "rewards/brier_reward": 0.8538621544837952, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9898437619209289, "rewards/mean_confidence_reward": 0.7169571995735169, "signal/accuracy_reward/centered_abs_mean": 0.11087782084941863, "signal/accuracy_reward/group_std_mean": 0.14915724247694015, "signal/accuracy_reward/group_zero_std_frac": 0.5666666805744172, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05543891042470932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05543891042470932, "signal/advantage_abs_mean": 0.0851035937666893, "signal/advantage_pre_scale_abs_mean": 0.0851035937666893, "signal/advantage_pre_scale_std": 0.17648011147975923, "signal/advantage_std": 0.17648011147975923, "signal/brier_reward/centered_abs_mean": 0.07257164269685745, "signal/brier_reward/group_std_mean": 0.10160021483898163, "signal/brier_reward/group_zero_std_frac": 0.09444444626569748, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036285821348428726, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036285821348428726, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.018310546688735486, "signal/format_reward/group_std_mean": 0.038288489356637, "signal/format_reward/group_zero_std_frac": 0.8305555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.009155273344367743, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009155273344367743, "signal/mean_confidence_reward/centered_abs_mean": 0.06302084624767304, "signal/mean_confidence_reward/group_std_mean": 0.08450654447078705, "signal/mean_confidence_reward/group_zero_std_frac": 0.09444444626569748, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.302084443632338e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.302084443632338e-07, "step": 485 }, { "calibration/aurc": 0.11486159778046853, "calibration/batch_distribution_entropy": 0.6624061820347166, "calibration/confidence_entropy": 0.39815207109176753, "calibration/coverage@0%": 0.17049049324882887, "calibration/coverage@1%": 0.17049049324882887, "calibration/coverage@10%": 0.49475179255700763, "calibration/coverage@15%": 0.6361186825719511, "calibration/coverage@20%": 0.7465725965686932, "calibration/coverage@25%": 0.9795823824893951, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.2935430620693712, "calibration/ece": 0.10675014719871163, "calibration/mean_confidence": 0.8032809360758473, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010763888888888906, "completions/max_length": 3375.4, "completions/max_terminated_length": 3375.4, "completions/mean_length": 943.4421997070312, "completions/mean_terminated_length": 953.8194702148437, "completions/min_length": 0.0, "completions/min_terminated_length": 295.6, "epoch": 1.1775977800277497, "grad_norm": 0.0004896249738521874, "learning_rate": 3.3052884615384617e-06, "loss": -0.0102, "num_tokens": 1206095084.0, "reward": 1.2926331996917724, "reward_std": 0.1452304095029831, "rewards/accuracy_reward": 0.736805546283722, "rewards/brier_reward": 0.859295642375946, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9891493082046509, "rewards/mean_confidence_reward": 0.7859322905540467, "signal/accuracy_reward/centered_abs_mean": 0.12227647751569748, "signal/accuracy_reward/group_std_mean": 0.16397155821323395, "signal/accuracy_reward/group_zero_std_frac": 0.5166666746139527, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06113823875784874, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06113823875784874, "signal/advantage_abs_mean": 0.10174858123064041, "signal/advantage_pre_scale_abs_mean": 0.10174858123064041, "signal/advantage_pre_scale_std": 0.2041553348302841, "signal/advantage_std": 0.2041553348302841, "signal/brier_reward/centered_abs_mean": 0.07990672290325165, "signal/brier_reward/group_std_mean": 0.11316336095333099, "signal/brier_reward/group_zero_std_frac": 0.1638888895511627, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03995336145162583, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03995336145162583, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.019048394076526164, "signal/format_reward/group_std_mean": 0.04067052379250526, "signal/format_reward/group_zero_std_frac": 0.8138889074325562, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.009524197038263082, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009524197038263082, "signal/mean_confidence_reward/centered_abs_mean": 0.05718935951590538, "signal/mean_confidence_reward/group_std_mean": 0.07908675074577332, "signal/mean_confidence_reward/group_zero_std_frac": 0.16666666567325591, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.718935710774531e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.718935710774531e-07, "step": 490 }, { "calibration/aurc": 0.1721780673019536, "calibration/batch_distribution_entropy": 0.4579305251916061, "calibration/confidence_entropy": 0.30944282969731524, "calibration/coverage@0%": 0.12965469160104987, "calibration/coverage@1%": 0.12965469160104987, "calibration/coverage@10%": 0.3214366588029303, "calibration/coverage@15%": 0.5313855273843054, "calibration/coverage@20%": 0.7017125470282272, "calibration/coverage@25%": 0.756135770234987, "calibration/coverage@30%": 0.7791122715404699, "calibration/coverage@5%": 0.12965469160104987, "calibration/ece": 0.18617709542911232, "calibration/mean_confidence": 0.8732151285228765, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008420138888888883, "completions/max_length": 3787.2, "completions/max_terminated_length": 3787.2, "completions/mean_length": 902.4283203125, "completions/mean_terminated_length": 910.041552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 289.2, "epoch": 1.1895976300296247, "grad_norm": 0.0005582338199019432, "learning_rate": 3.2752403846153846e-06, "loss": -0.0069, "num_tokens": 1219570098.0, "reward": 1.2696094989776612, "reward_std": 0.13646844774484634, "rewards/accuracy_reward": 0.7230034828186035, "rewards/brier_reward": 0.824705171585083, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9914930462837219, "rewards/mean_confidence_reward": 0.862151050567627, "signal/accuracy_reward/centered_abs_mean": 0.10340169370174408, "signal/accuracy_reward/group_std_mean": 0.14321069717407225, "signal/accuracy_reward/group_zero_std_frac": 0.5638889014720917, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05170084685087204, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05170084685087204, "signal/advantage_abs_mean": 0.09558721035718917, "signal/advantage_pre_scale_abs_mean": 0.09558721035718917, "signal/advantage_pre_scale_std": 0.1999672532081604, "signal/advantage_std": 0.1999672532081604, "signal/brier_reward/centered_abs_mean": 0.0856032207608223, "signal/brier_reward/group_std_mean": 0.11895442605018616, "signal/brier_reward/group_zero_std_frac": 0.3333333432674408, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04280161038041115, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04280161038041115, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.013867187104187906, "signal/format_reward/group_std_mean": 0.02801370806992054, "signal/format_reward/group_zero_std_frac": 0.8777777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006933593552093953, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006933593552093953, "signal/mean_confidence_reward/centered_abs_mean": 0.04367283061146736, "signal/mean_confidence_reward/group_std_mean": 0.06191684454679489, "signal/mean_confidence_reward/group_zero_std_frac": 0.38055555820465087, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.3672828269336607e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.3672828269336607e-07, "step": 495 }, { "calibration/aurc": 0.18218693610495315, "calibration/batch_distribution_entropy": 0.44690489820666845, "calibration/confidence_entropy": 0.2991156609412481, "calibration/coverage@0%": 0.007820702099737533, "calibration/coverage@1%": 0.007820702099737533, "calibration/coverage@10%": 0.2671875, "calibration/coverage@15%": 0.4666666666666666, "calibration/coverage@20%": 0.5456077755905512, "calibration/coverage@25%": 0.718537859007833, "calibration/coverage@30%": 0.861695753744675, "calibration/coverage@5%": 0.13490403543307086, "calibration/ece": 0.18160425872247152, "calibration/mean_confidence": 0.8769006599630138, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004600694444444442, "completions/max_length": 3656.8, "completions/max_terminated_length": 3656.8, "completions/mean_length": 892.4573852539063, "completions/mean_terminated_length": 896.5928344726562, "completions/min_length": 0.0, "completions/min_terminated_length": 314.2, "epoch": 1.2015974800314997, "grad_norm": 0.0006056484417058527, "learning_rate": 3.245192307692308e-06, "loss": -0.0051, "num_tokens": 1232904711.0, "reward": 1.2776667833328248, "reward_std": 0.13408059030771255, "rewards/accuracy_reward": 0.7296006917953491, "rewards/brier_reward": 0.8304029703140259, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.995312488079071, "rewards/mean_confidence_reward": 0.8629522681236267, "signal/accuracy_reward/centered_abs_mean": 0.11220160573720932, "signal/accuracy_reward/group_std_mean": 0.1495628148317337, "signal/accuracy_reward/group_zero_std_frac": 0.569444453716278, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05610080286860466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05610080286860466, "signal/advantage_abs_mean": 0.09804172962903976, "signal/advantage_pre_scale_abs_mean": 0.09804172962903976, "signal/advantage_pre_scale_std": 0.20113952159881593, "signal/advantage_std": 0.20113952159881593, "signal/brier_reward/centered_abs_mean": 0.08865490555763245, "signal/brier_reward/group_std_mean": 0.1197259098291397, "signal/brier_reward/group_zero_std_frac": 0.32500000596046447, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04432745277881622, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04432745277881622, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.008355034701526164, "signal/format_reward/group_std_mean": 0.01793058905750513, "signal/format_reward/group_zero_std_frac": 0.9166666865348816, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004177517350763082, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004177517350763082, "signal/mean_confidence_reward/centered_abs_mean": 0.044824466854333875, "signal/mean_confidence_reward/group_std_mean": 0.06038721278309822, "signal/mean_confidence_reward/group_zero_std_frac": 0.39166666865348815, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.4824465703641183e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.4824465703641183e-07, "step": 500 }, { "epoch": 1.2015974800314997, "eval_calibration/aurc": 0.14758615408766326, "eval_calibration/batch_distribution_entropy": 0.4141658432412414, "eval_calibration/confidence_entropy": 0.29144892590637456, "eval_calibration/coverage@0%": 0.005208333333333333, "eval_calibration/coverage@1%": 0.005208333333333333, "eval_calibration/coverage@10%": 0.3854166666666667, "eval_calibration/coverage@15%": 0.6510416666666666, "eval_calibration/coverage@20%": 0.7916666666666666, "eval_calibration/coverage@25%": 0.8333333333333334, "eval_calibration/coverage@30%": 0.953125, "eval_calibration/coverage@5%": 0.125, "eval_calibration/ece": 0.1689062499999999, "eval_calibration/mean_confidence": 0.87203125, "eval_completions/clipped_ratio": 0.0017361111111111234, "eval_completions/max_length": 2803.3333333333335, "eval_completions/max_terminated_length": 2803.3333333333335, "eval_completions/mean_length": 879.5210266113281, "eval_completions/mean_terminated_length": 881.0595296223959, "eval_completions/min_length": 212.66666666666666, "eval_completions/min_terminated_length": 292.1666666666667, "eval_loss": 0.0, "eval_num_tokens": 1232904711.0, "eval_reward": 1.2630668878555298, "eval_reward_std": 0.3719491461912791, "eval_rewards/accuracy_reward": 0.7178819477558136, "eval_rewards/brier_reward": 0.8099703788757324, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9982638955116272, "eval_rewards/mean_confidence_reward": 0.8715017139911652, "eval_runtime": 159.5509, "eval_samples_per_second": 6.268, "eval_signal/accuracy_reward/centered_abs_mean": 0.39111328125, "eval_signal/accuracy_reward/group_std_mean": 0.44743746519088745, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.195556640625, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.195556640625, "eval_signal/advantage_abs_mean": 0.31733184556166333, "eval_signal/advantage_pre_scale_abs_mean": 0.31733184556166333, "eval_signal/advantage_pre_scale_std": 0.36861565212408703, "eval_signal/advantage_std": 0.36861565212408703, "eval_signal/brier_reward/centered_abs_mean": 0.25390299409627914, "eval_signal/brier_reward/group_std_mean": 0.3172287692626317, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.12695149704813957, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.12695149704813957, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.0033637151742974916, "eval_signal/format_reward/group_std_mean": 0.009820927555362383, "eval_signal/format_reward/group_zero_std_frac": 0.944444457689921, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.10768719141681989, "eval_signal/mean_confidence_reward/group_std_mean": 0.1588304415345192, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.0768718728589495e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.0768718728589495e-06, "eval_steps_per_second": 0.038, "step": 500 }, { "epoch": 1.2015974800314997, "step": 500, "train_probe_calibration/aurc": 0.2180504117898677, "train_probe_calibration/batch_distribution_entropy": 0.4334598712151929, "train_probe_calibration/confidence_entropy": 0.30623673858051076, "train_probe_calibration/coverage@0%": 0.0, "train_probe_calibration/coverage@1%": 0.0, "train_probe_calibration/coverage@10%": 0.13541666666666666, "train_probe_calibration/coverage@15%": 0.2552083333333333, "train_probe_calibration/coverage@20%": 0.5, "train_probe_calibration/coverage@25%": 0.59375, "train_probe_calibration/coverage@30%": 0.8541666666666666, "train_probe_calibration/coverage@5%": 0.11979166666666667, "train_probe_calibration/ece": 0.2208333333333333, "train_probe_calibration/mean_confidence": 0.8666666666666666, "train_probe_completions/clipped_ratio": 0.002604166666666685, "train_probe_completions/max_length": 2612.6666666666665, "train_probe_completions/max_terminated_length": 2612.6666666666665, "train_probe_completions/mean_length": 886.4597473144531, "train_probe_completions/mean_terminated_length": 888.8030293782552, "train_probe_completions/min_length": 143.5, "train_probe_completions/min_terminated_length": 277.0, "train_probe_loss": 0.0, "train_probe_num_tokens": 1232904711.0, "train_probe_reward": 1.2819882829984028, "train_probe_reward_std": 0.3565460095802943, "train_probe_rewards/accuracy_reward": 0.7352430621782938, "train_probe_rewards/brier_reward": 0.8313202261924744, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9973958432674408, "train_probe_rewards/mean_confidence_reward": 0.8690711955229441, "train_probe_runtime": 173.2431, "train_probe_samples_per_second": 5.772, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3763563384612401, "train_probe_signal/accuracy_reward/group_std_mean": 0.43859484791755676, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.18817816923062006, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.18817816923062006, "train_probe_signal/advantage_abs_mean": 0.2991163084904353, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2991163084904353, "train_probe_signal/advantage_pre_scale_std": 0.35482022166252136, "train_probe_signal/advantage_std": 0.35482022166252136, "train_probe_signal/brier_reward/centered_abs_mean": 0.22763003905614218, "train_probe_signal/brier_reward/group_std_mean": 0.29305944343407947, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.11381501952807109, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.11381501952807109, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.0050455727614462376, "train_probe_signal/format_reward/group_std_mean": 0.014731391333043575, "train_probe_signal/format_reward/group_zero_std_frac": 0.9166666865348816, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0025227863807231188, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.0025227863807231188, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.10988824317852657, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.1606998418768247, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.0988824215019122e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.0988824215019122e-06, "train_probe_steps_per_second": 0.035 }, { "calibration/aurc": 0.18286692170854216, "calibration/batch_distribution_entropy": 0.4862683886848287, "calibration/confidence_entropy": 0.3050353892485435, "calibration/coverage@0%": 0.0026123472949389176, "calibration/coverage@1%": 0.18229984729493892, "calibration/coverage@10%": 0.30994491710296684, "calibration/coverage@15%": 0.3455470113438045, "calibration/coverage@20%": 0.5065717713787086, "calibration/coverage@25%": 0.7071962260034904, "calibration/coverage@30%": 0.7661458333333333, "calibration/coverage@5%": 0.1869873472949389, "calibration/ece": 0.17578446771378708, "calibration/mean_confidence": 0.8494795484293194, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002083333333333348, "completions/max_length": 3206.4, "completions/max_terminated_length": 3206.4, "completions/mean_length": 903.7861206054688, "completions/mean_terminated_length": 905.6838256835938, "completions/min_length": 0.0, "completions/min_terminated_length": 243.0, "epoch": 1.2135973300333747, "grad_norm": 0.0005444893031381071, "learning_rate": 3.215144230769231e-06, "loss": -0.002, "num_tokens": 1246436519.0, "reward": 1.27201669216156, "reward_std": 0.13511895537376403, "rewards/accuracy_reward": 0.7149305582046509, "rewards/brier_reward": 0.8311691761016846, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9979166746139526, "rewards/mean_confidence_reward": 0.8521206498146057, "signal/accuracy_reward/centered_abs_mean": 0.11154514104127884, "signal/accuracy_reward/group_std_mean": 0.1528396040201187, "signal/accuracy_reward/group_zero_std_frac": 0.5388888895511628, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05577257052063942, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05577257052063942, "signal/advantage_abs_mean": 0.09784860163927078, "signal/advantage_pre_scale_abs_mean": 0.09784860163927078, "signal/advantage_pre_scale_std": 0.19753907024860382, "signal/advantage_std": 0.19753907024860382, "signal/brier_reward/centered_abs_mean": 0.09021688997745514, "signal/brier_reward/group_std_mean": 0.1227357879281044, "signal/brier_reward/group_zero_std_frac": 0.27499999701976774, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04510844498872757, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04510844498872757, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.00392795130610466, "signal/format_reward/group_std_mean": 0.009288318641483783, "signal/format_reward/group_zero_std_frac": 0.955555546283722, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00196397565305233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00196397565305233, "signal/mean_confidence_reward/centered_abs_mean": 0.04561000131070614, "signal/mean_confidence_reward/group_std_mean": 0.06142571866512299, "signal/mean_confidence_reward/group_zero_std_frac": 0.32222222685813906, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.561000082503597e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.561000082503597e-07, "step": 505 }, { "calibration/aurc": 0.17313702294842698, "calibration/batch_distribution_entropy": 0.6568670793665856, "calibration/confidence_entropy": 0.3682684877999604, "calibration/coverage@0%": 0.10054412303604965, "calibration/coverage@1%": 0.10054412303604965, "calibration/coverage@10%": 0.3931202905272319, "calibration/coverage@15%": 0.421820518986762, "calibration/coverage@20%": 0.558316928908433, "calibration/coverage@25%": 0.8208376276854015, "calibration/coverage@30%": 0.9208059210526315, "calibration/coverage@5%": 0.124502456369383, "calibration/ece": 0.15794727265700148, "calibration/mean_confidence": 0.7888638763398379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3336.6, "completions/max_terminated_length": 3336.6, "completions/mean_length": 885.5307373046875, "completions/mean_terminated_length": 889.0730834960938, "completions/min_length": 0.0, "completions/min_terminated_length": 270.2, "epoch": 1.2255971800352496, "grad_norm": 0.00041673812665976584, "learning_rate": 3.185096153846154e-06, "loss": -0.0035, "num_tokens": 1259772521.0, "reward": 1.261064338684082, "reward_std": 0.13758435100317, "rewards/accuracy_reward": 0.6973958253860474, "rewards/brier_reward": 0.8286231637001038, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9960937619209289, "rewards/mean_confidence_reward": 0.7986258625984192, "signal/accuracy_reward/centered_abs_mean": 0.11393229365348816, "signal/accuracy_reward/group_std_mean": 0.1542375683784485, "signal/accuracy_reward/group_zero_std_frac": 0.55277778506279, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05696614682674408, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05696614682674408, "signal/advantage_abs_mean": 0.10055895000696183, "signal/advantage_pre_scale_abs_mean": 0.10055895000696183, "signal/advantage_pre_scale_std": 0.19419675469398498, "signal/advantage_std": 0.19419675469398498, "signal/brier_reward/centered_abs_mean": 0.09574300199747085, "signal/brier_reward/group_std_mean": 0.1277709424495697, "signal/brier_reward/group_zero_std_frac": 0.22500000298023223, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.047871500998735425, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.047871500998735425, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.007112630037590861, "signal/format_reward/group_std_mean": 0.015296760015189648, "signal/format_reward/group_zero_std_frac": 0.9305555701255799, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0035563150187954305, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0035563150187954305, "signal/mean_confidence_reward/centered_abs_mean": 0.05962920114398003, "signal/mean_confidence_reward/group_std_mean": 0.07785041034221649, "signal/mean_confidence_reward/group_zero_std_frac": 0.2583333343267441, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.962919544799661e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.962919544799661e-07, "step": 510 }, { "calibration/aurc": 0.15909262788038847, "calibration/batch_distribution_entropy": 0.7742407519854134, "calibration/confidence_entropy": 0.4172838669801112, "calibration/coverage@0%": 0.15583366369116783, "calibration/coverage@1%": 0.15583366369116783, "calibration/coverage@10%": 0.42869392745341955, "calibration/coverage@15%": 0.6183023081760146, "calibration/coverage@20%": 0.6888122325810289, "calibration/coverage@25%": 0.7180156657963446, "calibration/coverage@30%": 0.8896422566401924, "calibration/coverage@5%": 0.2709378303578345, "calibration/ece": 0.11066388140837242, "calibration/mean_confidence": 0.7183388572865546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006423611111111116, "completions/max_length": 3773.0, "completions/max_terminated_length": 3773.0, "completions/mean_length": 926.8890747070312, "completions/mean_terminated_length": 932.8871215820312, "completions/min_length": 0.0, "completions/min_terminated_length": 267.0, "epoch": 1.2375970300371246, "grad_norm": 0.00036098036798648536, "learning_rate": 3.1550480769230772e-06, "loss": -0.0043, "num_tokens": 1273574923.0, "reward": 1.2779627561569213, "reward_std": 0.12070412933826447, "rewards/accuracy_reward": 0.6998263835906983, "rewards/brier_reward": 0.8625087261199951, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9935763835906982, "rewards/mean_confidence_reward": 0.6975512146949768, "signal/accuracy_reward/centered_abs_mean": 0.12925347089767455, "signal/accuracy_reward/group_std_mean": 0.1685749977827072, "signal/accuracy_reward/group_zero_std_frac": 0.5250000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06462673544883728, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06462673544883728, "signal/advantage_abs_mean": 0.08825572282075882, "signal/advantage_pre_scale_abs_mean": 0.08825572282075882, "signal/advantage_pre_scale_std": 0.1734816163778305, "signal/advantage_std": 0.1734816163778305, "signal/brier_reward/centered_abs_mean": 0.0780844047665596, "signal/brier_reward/group_std_mean": 0.10564509630203248, "signal/brier_reward/group_zero_std_frac": 0.11666666716337204, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0390422023832798, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0390422023832798, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.010991753544658422, "signal/format_reward/group_std_mean": 0.020391251146793365, "signal/format_reward/group_zero_std_frac": 0.9166666865348816, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005495876772329211, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005495876772329211, "signal/mean_confidence_reward/centered_abs_mean": 0.07218203395605087, "signal/mean_confidence_reward/group_std_mean": 0.09279810190200806, "signal/mean_confidence_reward/group_zero_std_frac": 0.11944444477558136, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.218203109005116e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.218203109005116e-07, "step": 515 }, { "calibration/aurc": 0.10688721270823924, "calibration/batch_distribution_entropy": 0.8174157612463908, "calibration/confidence_entropy": 0.46762002451969503, "calibration/coverage@0%": 0.1616079199303742, "calibration/coverage@1%": 0.19962875326370755, "calibration/coverage@10%": 0.525788729329852, "calibration/coverage@15%": 0.6514686684073108, "calibration/coverage@20%": 0.8035873585726719, "calibration/coverage@25%": 0.876000870322019, "calibration/coverage@30%": 0.9364583333333334, "calibration/coverage@5%": 0.475788729329852, "calibration/ece": 0.11101623694516975, "calibration/mean_confidence": 0.6826906549173194, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002951388888888906, "completions/max_length": 3631.4, "completions/max_terminated_length": 3631.4, "completions/mean_length": 929.4780517578125, "completions/mean_terminated_length": 932.2742797851563, "completions/min_length": 0.0, "completions/min_terminated_length": 286.8, "epoch": 1.2495968800389996, "grad_norm": 0.0003078638401348144, "learning_rate": 3.125e-06, "loss": 0.0006, "num_tokens": 1287359022.0, "reward": 1.2903466701507569, "reward_std": 0.11199667304754257, "rewards/accuracy_reward": 0.7249131917953491, "rewards/brier_reward": 0.8587177991867065, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9970486044883728, "rewards/mean_confidence_reward": 0.6821267247200012, "signal/accuracy_reward/centered_abs_mean": 0.12643771916627883, "signal/accuracy_reward/group_std_mean": 0.1737253338098526, "signal/accuracy_reward/group_zero_std_frac": 0.4777777850627899, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06321885958313941, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06321885958313941, "signal/advantage_abs_mean": 0.07889304161071778, "signal/advantage_pre_scale_abs_mean": 0.07889304161071778, "signal/advantage_pre_scale_std": 0.15526196658611296, "signal/advantage_std": 0.15526196658611296, "signal/brier_reward/centered_abs_mean": 0.07279539853334427, "signal/brier_reward/group_std_mean": 0.10007892549037933, "signal/brier_reward/group_zero_std_frac": 0.0472222238779068, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036397699266672134, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036397699266672134, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.005164930480532348, "signal/format_reward/group_std_mean": 0.011545911617577075, "signal/format_reward/group_zero_std_frac": 0.9444444417953491, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002582465240266174, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002582465240266174, "signal/mean_confidence_reward/centered_abs_mean": 0.07087283730506896, "signal/mean_confidence_reward/group_std_mean": 0.09349334537982941, "signal/mean_confidence_reward/group_zero_std_frac": 0.05000000149011612, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.087283165674308e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.087283165674308e-07, "step": 520 }, { "calibration/aurc": 0.0733175019363381, "calibration/batch_distribution_entropy": 0.7670812630229238, "calibration/confidence_entropy": 0.44559995144875675, "calibration/coverage@0%": 0.12074870261837305, "calibration/coverage@1%": 0.12074870261837305, "calibration/coverage@10%": 0.777211700097537, "calibration/coverage@15%": 0.848670739532315, "calibration/coverage@20%": 0.899070207122298, "calibration/coverage@25%": 0.916046334443253, "calibration/coverage@30%": 0.9447697386985722, "calibration/coverage@5%": 0.6007016912109344, "calibration/ece": 0.11923801868310606, "calibration/mean_confidence": 0.7233329389336098, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005208333333333348, "completions/max_length": 3688.6, "completions/max_terminated_length": 3688.6, "completions/mean_length": 937.0691772460938, "completions/mean_terminated_length": 941.9866088867187, "completions/min_length": 0.0, "completions/min_terminated_length": 253.2, "epoch": 1.2615967300408744, "grad_norm": 0.0003549282264430076, "learning_rate": 3.094951923076923e-06, "loss": -0.0041, "num_tokens": 1301287915.0, "reward": 1.307690715789795, "reward_std": 0.12226983457803726, "rewards/accuracy_reward": 0.7506076335906983, "rewards/brier_reward": 0.8700548529624939, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947048544883728, "rewards/mean_confidence_reward": 0.6952019929885864, "signal/accuracy_reward/centered_abs_mean": 0.13249240517616273, "signal/accuracy_reward/group_std_mean": 0.17707535922527312, "signal/accuracy_reward/group_zero_std_frac": 0.48055556416511536, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06624620258808137, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06624620258808137, "signal/advantage_abs_mean": 0.08557261526584625, "signal/advantage_pre_scale_abs_mean": 0.08557261526584625, "signal/advantage_pre_scale_std": 0.16835099160671235, "signal/advantage_std": 0.16835099160671235, "signal/brier_reward/centered_abs_mean": 0.07813414037227631, "signal/brier_reward/group_std_mean": 0.10621408671140671, "signal/brier_reward/group_zero_std_frac": 0.06944444626569748, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.039067070186138156, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.039067070186138156, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.009781901072710753, "signal/format_reward/group_std_mean": 0.02228541262447834, "signal/format_reward/group_zero_std_frac": 0.8944444537162781, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004890950536355377, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004890950536355377, "signal/mean_confidence_reward/centered_abs_mean": 0.07308160662651061, "signal/mean_confidence_reward/group_std_mean": 0.09511344730854035, "signal/mean_confidence_reward/group_zero_std_frac": 0.0722222238779068, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.308160434149614e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.308160434149614e-07, "step": 525 }, { "calibration/aurc": 0.11538239739786885, "calibration/batch_distribution_entropy": 0.7712741350899945, "calibration/confidence_entropy": 0.4136095947188873, "calibration/coverage@0%": 0.13875924717145344, "calibration/coverage@1%": 0.2543817601215658, "calibration/coverage@10%": 0.6712941207314358, "calibration/coverage@15%": 0.7528657083158286, "calibration/coverage@20%": 0.8137956744454087, "calibration/coverage@25%": 0.8450261780104711, "calibration/coverage@30%": 0.9104712041884817, "calibration/coverage@5%": 0.34036549597883303, "calibration/ece": 0.1136121768417238, "calibration/mean_confidence": 0.7103512041518616, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006944444444444442, "completions/max_length": 3784.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 934.8638061523437, "completions/mean_terminated_length": 941.44091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 261.8, "epoch": 1.2735965800427493, "grad_norm": 0.0003841710858978331, "learning_rate": 3.0649038461538464e-06, "loss": -0.0066, "num_tokens": 1315136906.0, "reward": 1.3033621549606322, "reward_std": 0.12401916682720185, "rewards/accuracy_reward": 0.7314236164093018, "rewards/brier_reward": 0.8822309255599976, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9930555582046509, "rewards/mean_confidence_reward": 0.7065008759498597, "signal/accuracy_reward/centered_abs_mean": 0.12860243171453475, "signal/accuracy_reward/group_std_mean": 0.17546527981758117, "signal/accuracy_reward/group_zero_std_frac": 0.4777777850627899, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06430121585726738, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06430121585726738, "signal/advantage_abs_mean": 0.08718660175800323, "signal/advantage_pre_scale_abs_mean": 0.08718660175800323, "signal/advantage_pre_scale_std": 0.1739441365003586, "signal/advantage_std": 0.1739441365003586, "signal/brier_reward/centered_abs_mean": 0.07954438626766205, "signal/brier_reward/group_std_mean": 0.10905488133430481, "signal/brier_reward/group_zero_std_frac": 0.12222222536802292, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03977219313383103, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03977219313383103, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.011631944309920072, "signal/format_reward/group_std_mean": 0.02209738679230213, "signal/format_reward/group_zero_std_frac": 0.9083333492279053, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005815972154960036, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005815972154960036, "signal/mean_confidence_reward/centered_abs_mean": 0.07923656105995178, "signal/mean_confidence_reward/group_std_mean": 0.10262601524591446, "signal/mean_confidence_reward/group_zero_std_frac": 0.1250000014901161, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.923655516606231e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.923655516606231e-07, "step": 530 }, { "calibration/aurc": 0.047759467154315845, "calibration/batch_distribution_entropy": 0.6663240429338483, "calibration/confidence_entropy": 0.36327989441421193, "calibration/coverage@0%": 0.2786051019636943, "calibration/coverage@1%": 0.4329326543197152, "calibration/coverage@10%": 0.8220726894849314, "calibration/coverage@15%": 0.8881215546137012, "calibration/coverage@20%": 0.9265298207706586, "calibration/coverage@25%": 0.9534031413612565, "calibration/coverage@30%": 0.9947643979057592, "calibration/coverage@5%": 0.7026888480808353, "calibration/ece": 0.11845875492044192, "calibration/mean_confidence": 0.7549263148968561, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010329861111111093, "completions/max_length": 3755.6, "completions/max_terminated_length": 3755.6, "completions/mean_length": 911.9911499023438, "completions/mean_terminated_length": 921.542333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 226.6, "epoch": 1.2855964300446243, "grad_norm": 0.00033254362642765045, "learning_rate": 3.0348557692307694e-06, "loss": -0.0097, "num_tokens": 1328711940.0, "reward": 1.298271369934082, "reward_std": 0.1155781701207161, "rewards/accuracy_reward": 0.746875011920929, "rewards/brier_reward": 0.859982693195343, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9896701455116272, "rewards/mean_confidence_reward": 0.7472734332084656, "signal/accuracy_reward/centered_abs_mean": 0.10274522602558137, "signal/accuracy_reward/group_std_mean": 0.14268279671669007, "signal/accuracy_reward/group_zero_std_frac": 0.569444453716278, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05137261301279068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05137261301279068, "signal/advantage_abs_mean": 0.08023017644882202, "signal/advantage_pre_scale_abs_mean": 0.08023017644882202, "signal/advantage_pre_scale_std": 0.17143599987030028, "signal/advantage_std": 0.17143599987030028, "signal/brier_reward/centered_abs_mean": 0.07835201472043991, "signal/brier_reward/group_std_mean": 0.10759939700365066, "signal/brier_reward/group_zero_std_frac": 0.18055555522441863, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03917600736021996, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03917600736021996, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01558702252805233, "signal/format_reward/group_std_mean": 0.027374763041734695, "signal/format_reward/group_zero_std_frac": 0.8916666746139527, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007793511264026165, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007793511264026165, "signal/mean_confidence_reward/centered_abs_mean": 0.06730367541313172, "signal/mean_confidence_reward/group_std_mean": 0.08901645392179489, "signal/mean_confidence_reward/group_zero_std_frac": 0.18333333134651184, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.730367203999776e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.730367203999776e-07, "step": 535 }, { "calibration/aurc": 0.10785256756308437, "calibration/batch_distribution_entropy": 0.7738383570962954, "calibration/confidence_entropy": 0.41636538135459567, "calibration/coverage@0%": 0.18239759625459775, "calibration/coverage@1%": 0.2844809295879311, "calibration/coverage@10%": 0.4832437683518448, "calibration/coverage@15%": 0.7355157885349313, "calibration/coverage@20%": 0.8030711055254747, "calibration/coverage@25%": 0.9126920861012657, "calibration/coverage@30%": 0.9142545861012656, "calibration/coverage@5%": 0.3713077187347193, "calibration/ece": 0.08519058560327294, "calibration/mean_confidence": 0.6837061314927783, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008767361111111116, "completions/max_length": 3699.6, "completions/max_terminated_length": 3699.6, "completions/mean_length": 971.444189453125, "completions/mean_terminated_length": 980.094970703125, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 1.2975962800464993, "grad_norm": 0.0003074088890571147, "learning_rate": 3.0048076923076923e-06, "loss": -0.007, "num_tokens": 1343009473.0, "reward": 1.3159483194351196, "reward_std": 0.12239857465028763, "rewards/accuracy_reward": 0.7714409589767456, "rewards/brier_reward": 0.8692085862159729, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9912326455116272, "rewards/mean_confidence_reward": 0.7209221601486206, "signal/accuracy_reward/centered_abs_mean": 0.11985134333372116, "signal/accuracy_reward/group_std_mean": 0.159143128991127, "signal/accuracy_reward/group_zero_std_frac": 0.5416666626930237, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05992567166686058, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05992567166686058, "signal/advantage_abs_mean": 0.08536248356103897, "signal/advantage_pre_scale_abs_mean": 0.08536248356103897, "signal/advantage_pre_scale_std": 0.17411383986473083, "signal/advantage_std": 0.17411383986473083, "signal/brier_reward/centered_abs_mean": 0.08521012216806412, "signal/brier_reward/group_std_mean": 0.11651065647602081, "signal/brier_reward/group_zero_std_frac": 0.16666666865348817, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04260506108403206, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04260506108403206, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.014892578311264514, "signal/format_reward/group_std_mean": 0.030052604153752327, "signal/format_reward/group_zero_std_frac": 0.8666666746139526, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007446289155632257, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007446289155632257, "signal/mean_confidence_reward/centered_abs_mean": 0.07823951542377472, "signal/mean_confidence_reward/group_std_mean": 0.10285495817661286, "signal/mean_confidence_reward/group_zero_std_frac": 0.18055555522441863, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.823951136742835e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.823951136742835e-07, "step": 540 }, { "calibration/aurc": 0.14532476673563677, "calibration/batch_distribution_entropy": 0.6419874850050629, "calibration/confidence_entropy": 0.3578615119935988, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.503082971933254, "calibration/coverage@15%": 0.5255141620986152, "calibration/coverage@20%": 0.772816380936071, "calibration/coverage@25%": 0.9069426265270506, "calibration/coverage@30%": 0.9403468586387435, "calibration/coverage@5%": 0.06822916666666666, "calibration/ece": 0.10405146674222407, "calibration/mean_confidence": 0.7830606406726042, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008854166666666673, "completions/max_length": 3923.8, "completions/max_terminated_length": 3923.8, "completions/mean_length": 980.4290771484375, "completions/mean_terminated_length": 989.1726440429687, "completions/min_length": 0.0, "completions/min_terminated_length": 270.8, "epoch": 1.3095961300483743, "grad_norm": 0.0003773860225919634, "learning_rate": 2.974759615384616e-06, "loss": -0.0086, "num_tokens": 1357348816.0, "reward": 1.2867057085037232, "reward_std": 0.13529545664787293, "rewards/accuracy_reward": 0.7306423544883728, "rewards/brier_reward": 0.8516080141067505, "rewards/confidence_one_or_zero": 0.0003472222248092294, "rewards/format_reward": 0.9911458373069764, "rewards/mean_confidence_reward": 0.7627899408340454, "signal/accuracy_reward/centered_abs_mean": 0.12644856721162795, "signal/accuracy_reward/group_std_mean": 0.1708012819290161, "signal/accuracy_reward/group_zero_std_frac": 0.5000000119209289, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06322428360581397, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06322428360581397, "signal/advantage_abs_mean": 0.09366002082824706, "signal/advantage_pre_scale_abs_mean": 0.09366002082824706, "signal/advantage_pre_scale_std": 0.18919834792613982, "signal/advantage_std": 0.18919834792613982, "signal/brier_reward/centered_abs_mean": 0.08626611977815628, "signal/brier_reward/group_std_mean": 0.11912181079387665, "signal/brier_reward/group_zero_std_frac": 0.19722222536802292, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04313305988907814, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04313305988907814, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006618923507630825, "signal/confidence_one_or_zero/group_std_mean": 0.0016652445774525404, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.618923364953844e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.618923364953844e-09, "signal/format_reward/centered_abs_mean": 0.015722656063735484, "signal/format_reward/group_std_mean": 0.031360096856951715, "signal/format_reward/group_zero_std_frac": 0.8666666626930237, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007861328031867742, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007861328031867742, "signal/mean_confidence_reward/centered_abs_mean": 0.07467889338731766, "signal/mean_confidence_reward/group_std_mean": 0.09980264008045196, "signal/mean_confidence_reward/group_zero_std_frac": 0.20277778208255767, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.467888963219593e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.467888963219593e-07, "step": 545 }, { "calibration/aurc": 0.10963906850641261, "calibration/batch_distribution_entropy": 0.5600195974257417, "calibration/confidence_entropy": 0.3221848751709251, "calibration/coverage@0%": 0.150825613588605, "calibration/coverage@1%": 0.150825613588605, "calibration/coverage@10%": 0.5307777881560866, "calibration/coverage@15%": 0.7710659485029022, "calibration/coverage@20%": 0.9097624563745438, "calibration/coverage@25%": 0.964913060385534, "calibration/coverage@30%": 0.9659574468085106, "calibration/coverage@5%": 0.150825613588605, "calibration/ece": 0.09524587920736664, "calibration/mean_confidence": 0.8138321187372866, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009982638888888885, "completions/max_length": 3978.4, "completions/max_terminated_length": 3978.4, "completions/mean_length": 982.8621459960938, "completions/mean_terminated_length": 993.000732421875, "completions/min_length": 0.0, "completions/min_terminated_length": 255.2, "epoch": 1.3215959800502493, "grad_norm": 0.0003739887906704098, "learning_rate": 2.9447115384615386e-06, "loss": -0.0129, "num_tokens": 1371741788.0, "reward": 1.2999849319458008, "reward_std": 0.13053077906370164, "rewards/accuracy_reward": 0.7560763955116272, "rewards/brier_reward": 0.8538602232933045, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9900173664093017, "rewards/mean_confidence_reward": 0.7924123406410217, "signal/accuracy_reward/centered_abs_mean": 0.11145833432674408, "signal/accuracy_reward/group_std_mean": 0.15393095463514328, "signal/accuracy_reward/group_zero_std_frac": 0.5333333492279053, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05572916716337204, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05572916716337204, "signal/advantage_abs_mean": 0.09160781055688857, "signal/advantage_pre_scale_abs_mean": 0.09160781055688857, "signal/advantage_pre_scale_std": 0.1891625076532364, "signal/advantage_std": 0.1891625076532364, "signal/brier_reward/centered_abs_mean": 0.0860573947429657, "signal/brier_reward/group_std_mean": 0.11696600019931794, "signal/brier_reward/group_zero_std_frac": 0.1944444477558136, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04302869737148285, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04302869737148285, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.017008463479578496, "signal/format_reward/group_std_mean": 0.030541813001036644, "signal/format_reward/group_zero_std_frac": 0.8805555701255798, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.008504231739789248, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.008504231739789248, "signal/mean_confidence_reward/centered_abs_mean": 0.06817357018589973, "signal/mean_confidence_reward/group_std_mean": 0.09106780588626862, "signal/mean_confidence_reward/group_zero_std_frac": 0.2083333373069763, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.817357188992901e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.817357188992901e-07, "step": 550 }, { "epoch": 1.3215959800502493, "eval_calibration/aurc": 0.1459489788696482, "eval_calibration/batch_distribution_entropy": 0.5430151783081066, "eval_calibration/confidence_entropy": 0.33289478353000584, "eval_calibration/coverage@0%": 0.0, "eval_calibration/coverage@1%": 0.0, "eval_calibration/coverage@10%": 0.3551747311827957, "eval_calibration/coverage@15%": 0.5554435483870968, "eval_calibration/coverage@20%": 0.8388216845878137, "eval_calibration/coverage@25%": 0.9035394265232974, "eval_calibration/coverage@30%": 0.9946236559139785, "eval_calibration/coverage@5%": 0.21169354838709675, "eval_calibration/ece": 0.16260080645161284, "eval_calibration/mean_confidence": 0.8019466845878136, "eval_completions/clipped_ratio": 0.015625000000000017, "eval_completions/max_length": 3464.8333333333335, "eval_completions/max_terminated_length": 3464.8333333333335, "eval_completions/mean_length": 951.442372639974, "eval_completions/mean_terminated_length": 966.6598917643229, "eval_completions/min_length": 60.333333333333336, "eval_completions/min_terminated_length": 288.3333333333333, "eval_loss": 0.0, "eval_num_tokens": 1371741788.0, "eval_reward": 1.2755183180173237, "eval_reward_std": 0.35252896944681805, "eval_rewards/accuracy_reward": 0.722222218910853, "eval_rewards/brier_reward": 0.8461598853270212, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9826388955116272, "eval_rewards/mean_confidence_reward": 0.7744444410006205, "eval_runtime": 215.7812, "eval_samples_per_second": 4.634, "eval_signal/accuracy_reward/centered_abs_mean": 0.3891059011220932, "eval_signal/accuracy_reward/group_std_mean": 0.44586536784966785, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1945529505610466, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.1945529505610466, "eval_signal/advantage_abs_mean": 0.2868380695581436, "eval_signal/advantage_pre_scale_abs_mean": 0.2868380695581436, "eval_signal/advantage_pre_scale_std": 0.35141922533512115, "eval_signal/advantage_std": 0.35141922533512115, "eval_signal/brier_reward/centered_abs_mean": 0.20076740781466165, "eval_signal/brier_reward/group_std_mean": 0.275022029876709, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.10038370390733083, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.10038370390733083, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.03331163137530287, "eval_signal/format_reward/group_std_mean": 0.08924104925245047, "eval_signal/format_reward/group_zero_std_frac": 0.5277777910232544, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.016655815687651437, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.016655815687651437, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.21365723758935928, "eval_signal/mean_confidence_reward/group_std_mean": 0.264442707101504, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.1365722583747506e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.1365722583747506e-06, "eval_steps_per_second": 0.028, "step": 550 }, { "epoch": 1.3215959800502493, "step": 550, "train_probe_calibration/aurc": 0.111951449400212, "train_probe_calibration/batch_distribution_entropy": 0.5783584476939697, "train_probe_calibration/confidence_entropy": 0.3550639866908134, "train_probe_calibration/coverage@0%": 0.08854166666666667, "train_probe_calibration/coverage@1%": 0.08854166666666667, "train_probe_calibration/coverage@10%": 0.5132728494623656, "train_probe_calibration/coverage@15%": 0.7172379032258065, "train_probe_calibration/coverage@20%": 0.9005376344086021, "train_probe_calibration/coverage@25%": 0.9422043010752689, "train_probe_calibration/coverage@30%": 0.9739583333333334, "train_probe_calibration/coverage@5%": 0.13020833333333334, "train_probe_calibration/ece": 0.1219422043010752, "train_probe_calibration/mean_confidence": 0.7878360215053765, "train_probe_completions/clipped_ratio": 0.006944444444444438, "train_probe_completions/max_length": 3380.8333333333335, "train_probe_completions/max_terminated_length": 3380.8333333333335, "train_probe_completions/mean_length": 1007.7863159179688, "train_probe_completions/mean_terminated_length": 1014.6436360677084, "train_probe_completions/min_length": 126.66666666666667, "train_probe_completions/min_terminated_length": 251.83333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 1371741788.0, "train_probe_reward": 1.3095344305038452, "train_probe_reward_std": 0.3171052138010661, "train_probe_rewards/accuracy_reward": 0.7586805522441864, "train_probe_rewards/brier_reward": 0.871657262245814, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9887152711550394, "train_probe_rewards/mean_confidence_reward": 0.7769299944241842, "train_probe_runtime": 200.8501, "train_probe_samples_per_second": 4.979, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3587239583333333, "train_probe_signal/accuracy_reward/group_std_mean": 0.42867589990297955, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.17936197916666666, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.17936197916666666, "train_probe_signal/advantage_abs_mean": 0.25219326714674634, "train_probe_signal/advantage_pre_scale_abs_mean": 0.25219326714674634, "train_probe_signal/advantage_pre_scale_std": 0.31636855006217957, "train_probe_signal/advantage_std": 0.31636855006217957, "train_probe_signal/brier_reward/centered_abs_mean": 0.1677782485882441, "train_probe_signal/brier_reward/group_std_mean": 0.23706789314746857, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08388912429412206, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.08388912429412206, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.021755642257630825, "train_probe_signal/format_reward/group_std_mean": 0.06084662117063999, "train_probe_signal/format_reward/group_zero_std_frac": 0.6666666716337204, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.010877821128815413, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.010877821128815413, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.21034109344085059, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.2607882668574651, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.1034107362538634e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.1034107362538634e-06, "train_probe_steps_per_second": 0.03 }, { "calibration/aurc": 0.16387994620194077, "calibration/batch_distribution_entropy": 0.6666863537673254, "calibration/confidence_entropy": 0.3869006174238223, "calibration/coverage@0%": 0.002091514017666852, "calibration/coverage@1%": 0.4092487133625268, "calibration/coverage@10%": 0.5435385461041639, "calibration/coverage@15%": 0.5571155696028584, "calibration/coverage@20%": 0.5986780266593377, "calibration/coverage@25%": 0.6339411845540746, "calibration/coverage@30%": 0.6942778617562182, "calibration/coverage@5%": 0.5084337305507555, "calibration/ece": 0.13502381046673145, "calibration/mean_confidence": 0.7431247212931693, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012673611111111116, "completions/max_length": 3867.0, "completions/max_terminated_length": 3867.0, "completions/mean_length": 998.92041015625, "completions/mean_terminated_length": 1011.8125732421875, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 1.3335958300521242, "grad_norm": 0.0003139972104690969, "learning_rate": 2.9146634615384615e-06, "loss": -0.0151, "num_tokens": 1386353991.0, "reward": 1.2860611438751222, "reward_std": 0.12302568852901459, "rewards/accuracy_reward": 0.7322916626930237, "rewards/brier_reward": 0.8524889469146728, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9873263835906982, "rewards/mean_confidence_reward": 0.7607763290405274, "signal/accuracy_reward/centered_abs_mean": 0.10929904580116272, "signal/accuracy_reward/group_std_mean": 0.1468104600906372, "signal/accuracy_reward/group_zero_std_frac": 0.5611111164093018, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05464952290058136, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05464952290058136, "signal/advantage_abs_mean": 0.08562225699424744, "signal/advantage_pre_scale_abs_mean": 0.08562225699424744, "signal/advantage_pre_scale_std": 0.18145413398742677, "signal/advantage_std": 0.18145413398742677, "signal/brier_reward/centered_abs_mean": 0.08113540261983872, "signal/brier_reward/group_std_mean": 0.11180048584938049, "signal/brier_reward/group_zero_std_frac": 0.18611111342906952, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04056770130991936, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04056770130991936, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01957465298473835, "signal/format_reward/group_std_mean": 0.03440774716436863, "signal/format_reward/group_zero_std_frac": 0.8666666626930237, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.009787326492369176, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009787326492369176, "signal/mean_confidence_reward/centered_abs_mean": 0.0696688748896122, "signal/mean_confidence_reward/group_std_mean": 0.09033358991146087, "signal/mean_confidence_reward/group_zero_std_frac": 0.22222222685813903, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.966887212911388e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.966887212911388e-07, "step": 555 }, { "calibration/aurc": 0.1313908098960462, "calibration/batch_distribution_entropy": 0.6123164479035214, "calibration/confidence_entropy": 0.34287934307447887, "calibration/coverage@0%": 0.0020944148936170213, "calibration/coverage@1%": 0.1531360815602837, "calibration/coverage@10%": 0.29741782239798004, "calibration/coverage@15%": 0.5160841744913225, "calibration/coverage@20%": 0.8339968399403898, "calibration/coverage@25%": 0.9339681994966827, "calibration/coverage@30%": 0.9629032258064516, "calibration/coverage@5%": 0.1828235815602837, "calibration/ece": 0.10925277468482926, "calibration/mean_confidence": 0.7790019482058517, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00894097222222221, "completions/max_length": 3972.0, "completions/max_terminated_length": 3972.0, "completions/mean_length": 1008.0288208007812, "completions/mean_terminated_length": 1017.2925415039062, "completions/min_length": 0.0, "completions/min_terminated_length": 258.8, "epoch": 1.3455956800539992, "grad_norm": 0.0003592314606066793, "learning_rate": 2.8846153846153845e-06, "loss": -0.0107, "num_tokens": 1401056179.0, "reward": 1.2946399927139283, "reward_std": 0.12469799071550369, "rewards/accuracy_reward": 0.7381944417953491, "rewards/brier_reward": 0.8600115299224853, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9910590291023255, "rewards/mean_confidence_reward": 0.7452022671699524, "signal/accuracy_reward/centered_abs_mean": 0.11985676884651184, "signal/accuracy_reward/group_std_mean": 0.15804824233055115, "signal/accuracy_reward/group_zero_std_frac": 0.5472222208976746, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05992838442325592, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05992838442325592, "signal/advantage_abs_mean": 0.09076006263494492, "signal/advantage_pre_scale_abs_mean": 0.09076006263494492, "signal/advantage_pre_scale_std": 0.18512835204601288, "signal/advantage_std": 0.18512835204601288, "signal/brier_reward/centered_abs_mean": 0.08587552607059479, "signal/brier_reward/group_std_mean": 0.11345891803503036, "signal/brier_reward/group_zero_std_frac": 0.21388888955116273, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.042937763035297394, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.042937763035297394, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.015662977658212184, "signal/format_reward/group_std_mean": 0.0296281848102808, "signal/format_reward/group_zero_std_frac": 0.8777777791023255, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007831488829106092, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007831488829106092, "signal/mean_confidence_reward/centered_abs_mean": 0.06804640367627143, "signal/mean_confidence_reward/group_std_mean": 0.08904693871736527, "signal/mean_confidence_reward/group_zero_std_frac": 0.23055555820465087, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.804640293012199e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.804640293012199e-07, "step": 560 }, { "calibration/aurc": 0.11484866586071156, "calibration/batch_distribution_entropy": 0.5421177077904118, "calibration/confidence_entropy": 0.3175850262001279, "calibration/coverage@0%": 0.1101827676240209, "calibration/coverage@1%": 0.2882506527415144, "calibration/coverage@10%": 0.49560407007584895, "calibration/coverage@15%": 0.7172596345393838, "calibration/coverage@20%": 0.7905013192612137, "calibration/coverage@25%": 0.8, "calibration/coverage@30%": 0.8, "calibration/coverage@5%": 0.44034390349759234, "calibration/ece": 0.12121333571602591, "calibration/mean_confidence": 0.8234524208976792, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008593749999999978, "completions/max_length": 3968.6, "completions/max_terminated_length": 3968.6, "completions/mean_length": 979.8955932617188, "completions/mean_terminated_length": 988.489404296875, "completions/min_length": 0.0, "completions/min_terminated_length": 268.8, "epoch": 1.3575955300558742, "grad_norm": 0.0003435547405388206, "learning_rate": 2.8545673076923082e-06, "loss": -0.0084, "num_tokens": 1415423104.0, "reward": 1.3047271966934204, "reward_std": 0.11923027485609054, "rewards/accuracy_reward": 0.7565972328186035, "rewards/brier_reward": 0.8614351153373718, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.99140625, "rewards/mean_confidence_reward": 0.7901580929756165, "signal/accuracy_reward/centered_abs_mean": 0.10751953125, "signal/accuracy_reward/group_std_mean": 0.1448575958609581, "signal/accuracy_reward/group_zero_std_frac": 0.5722222328186035, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.053759765625, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.053759765625, "signal/advantage_abs_mean": 0.08398886173963546, "signal/advantage_pre_scale_abs_mean": 0.08398886173963546, "signal/advantage_pre_scale_std": 0.17855680584907532, "signal/advantage_std": 0.17855680584907532, "signal/brier_reward/centered_abs_mean": 0.07681927233934402, "signal/brier_reward/group_std_mean": 0.10601076185703277, "signal/brier_reward/group_zero_std_frac": 0.24722222089767457, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03840963616967201, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03840963616967201, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.01299370639026165, "signal/format_reward/group_std_mean": 0.025268582440912724, "signal/format_reward/group_zero_std_frac": 0.8916666746139527, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006496853195130825, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006496853195130825, "signal/mean_confidence_reward/centered_abs_mean": 0.05918462947010994, "signal/mean_confidence_reward/group_std_mean": 0.07994452565908432, "signal/mean_confidence_reward/group_zero_std_frac": 0.2722222238779068, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.918462647969136e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.918462647969136e-07, "step": 565 }, { "calibration/aurc": 0.11576167311042562, "calibration/batch_distribution_entropy": 0.6175232344638651, "calibration/confidence_entropy": 0.3462862535596102, "calibration/coverage@0%": 0.006807743065416943, "calibration/coverage@1%": 0.006807743065416943, "calibration/coverage@10%": 0.5329011700133417, "calibration/coverage@15%": 0.7625725723629395, "calibration/coverage@20%": 0.8546087030649405, "calibration/coverage@25%": 0.9278845096745822, "calibration/coverage@30%": 0.9816952506596305, "calibration/coverage@5%": 0.24855718013458494, "calibration/ece": 0.10048178034394602, "calibration/mean_confidence": 0.7754272520157706, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007291666666666652, "completions/max_length": 3888.6, "completions/max_terminated_length": 3888.6, "completions/mean_length": 999.9989624023438, "completions/mean_terminated_length": 1007.5173583984375, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 1.3695953800577492, "grad_norm": 0.0003193428856320679, "learning_rate": 2.8245192307692307e-06, "loss": -0.0084, "num_tokens": 1430026964.0, "reward": 1.2861733198165894, "reward_std": 0.11641044020652772, "rewards/accuracy_reward": 0.7322048664093017, "rewards/brier_reward": 0.8476781368255615, "rewards/confidence_one_or_zero": 0.0005208333430346101, "rewards/format_reward": 0.9924479126930237, "rewards/mean_confidence_reward": 0.7810012936592102, "signal/accuracy_reward/centered_abs_mean": 0.10400933176279067, "signal/accuracy_reward/group_std_mean": 0.14505575895309447, "signal/accuracy_reward/group_zero_std_frac": 0.5583333313465119, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05200466588139534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05200466588139534, "signal/advantage_abs_mean": 0.07831602916121483, "signal/advantage_pre_scale_abs_mean": 0.07831602916121483, "signal/advantage_pre_scale_std": 0.1718251585960388, "signal/advantage_std": 0.1718251585960388, "signal/brier_reward/centered_abs_mean": 0.0742592304944992, "signal/brier_reward/group_std_mean": 0.10601829439401626, "signal/brier_reward/group_zero_std_frac": 0.2416666716337204, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0371296152472496, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0371296152472496, "signal/confidence_one_or_zero/centered_abs_mean": 0.000998263864312321, "signal/confidence_one_or_zero/group_std_mean": 0.0026473373174667357, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9861111044883728, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 9.982638005112676e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 9.982638005112676e-09, "signal/format_reward/centered_abs_mean": 0.013026258442550898, "signal/format_reward/group_std_mean": 0.025032466650009154, "signal/format_reward/group_zero_std_frac": 0.8972222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006513129221275449, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006513129221275449, "signal/mean_confidence_reward/centered_abs_mean": 0.06129540354013443, "signal/mean_confidence_reward/group_std_mean": 0.08391042798757553, "signal/mean_confidence_reward/group_zero_std_frac": 0.2638888955116272, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.129540338406514e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.129540338406514e-07, "step": 570 }, { "calibration/aurc": 0.10830923976789804, "calibration/batch_distribution_entropy": 0.6550796725826317, "calibration/confidence_entropy": 0.3696787351188028, "calibration/coverage@0%": 0.016155352480417752, "calibration/coverage@1%": 0.1226827676240209, "calibration/coverage@10%": 0.6061466492602262, "calibration/coverage@15%": 0.6561752067014794, "calibration/coverage@20%": 0.8002257397737162, "calibration/coverage@25%": 0.8711229873803308, "calibration/coverage@30%": 0.9154577349869453, "calibration/coverage@5%": 0.4909948868581376, "calibration/ece": 0.10065406059617063, "calibration/mean_confidence": 0.7755118173411663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004600694444444419, "completions/max_length": 3914.2, "completions/max_terminated_length": 3914.2, "completions/mean_length": 1015.513818359375, "completions/mean_terminated_length": 1020.210498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 292.2, "epoch": 1.3815952300596241, "grad_norm": 0.00039001504774205387, "learning_rate": 2.7944711538461537e-06, "loss": -0.0039, "num_tokens": 1444799059.0, "reward": 1.297674012184143, "reward_std": 0.13122620582580566, "rewards/accuracy_reward": 0.741406238079071, "rewards/brier_reward": 0.8584399223327637, "rewards/confidence_one_or_zero": 0.00034722223062999547, "rewards/format_reward": 0.9954861164093017, "rewards/mean_confidence_reward": 0.7870616316795349, "signal/accuracy_reward/centered_abs_mean": 0.11826714426279068, "signal/accuracy_reward/group_std_mean": 0.15956346690654755, "signal/accuracy_reward/group_zero_std_frac": 0.5277777969837188, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05913357213139534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05913357213139534, "signal/advantage_abs_mean": 0.09634890556335449, "signal/advantage_pre_scale_abs_mean": 0.09634890556335449, "signal/advantage_pre_scale_std": 0.19221472442150117, "signal/advantage_std": 0.19221472442150117, "signal/brier_reward/centered_abs_mean": 0.09139910340309143, "signal/brier_reward/group_std_mean": 0.12151508778333664, "signal/brier_reward/group_zero_std_frac": 0.18333333134651184, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.045699551701545715, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.045699551701545715, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006727430387400091, "signal/confidence_one_or_zero/group_std_mean": 0.0019641853868961334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9888888835906983, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/format_reward/centered_abs_mean": 0.008148871548473834, "signal/format_reward/group_std_mean": 0.016635999642312528, "signal/format_reward/group_zero_std_frac": 0.9277777671813965, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004074435774236917, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004074435774236917, "signal/mean_confidence_reward/centered_abs_mean": 0.06603506281971931, "signal/mean_confidence_reward/group_std_mean": 0.08760364055633545, "signal/mean_confidence_reward/group_zero_std_frac": 0.20555555820465088, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.603505539715116e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.603505539715116e-07, "step": 575 }, { "calibration/aurc": 0.07146554733335315, "calibration/batch_distribution_entropy": 0.616077207414069, "calibration/confidence_entropy": 0.35214398196662333, "calibration/coverage@0%": 0.11989120131599204, "calibration/coverage@1%": 0.23094383289493942, "calibration/coverage@10%": 0.7576039872369753, "calibration/coverage@15%": 0.8067421235175563, "calibration/coverage@20%": 0.8895774193181494, "calibration/coverage@25%": 0.9633532753575516, "calibration/coverage@30%": 0.9937062697450024, "calibration/coverage@5%": 0.45990153474117806, "calibration/ece": 0.0911636164010684, "calibration/mean_confidence": 0.7995762769942684, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003645833333333326, "completions/max_length": 3807.8, "completions/max_terminated_length": 3807.8, "completions/mean_length": 987.5064086914062, "completions/mean_terminated_length": 991.0719604492188, "completions/min_length": 0.0, "completions/min_terminated_length": 315.2, "epoch": 1.3935950800614991, "grad_norm": 0.0003452130767982453, "learning_rate": 2.7644230769230775e-06, "loss": -0.0023, "num_tokens": 1459266941.0, "reward": 1.29806067943573, "reward_std": 0.12137905955314636, "rewards/accuracy_reward": 0.7465277791023255, "rewards/brier_reward": 0.8533100962638855, "rewards/confidence_one_or_zero": 0.0003472222248092294, "rewards/format_reward": 0.9962673425674439, "rewards/mean_confidence_reward": 0.7997212171554565, "signal/accuracy_reward/centered_abs_mean": 0.10619574785232544, "signal/accuracy_reward/group_std_mean": 0.15078104436397552, "signal/accuracy_reward/group_zero_std_frac": 0.522222226858139, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05309787392616272, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05309787392616272, "signal/advantage_abs_mean": 0.08400989323854446, "signal/advantage_pre_scale_abs_mean": 0.08400989323854446, "signal/advantage_pre_scale_std": 0.17453961968421935, "signal/advantage_std": 0.17453961968421935, "signal/brier_reward/centered_abs_mean": 0.0801599383354187, "signal/brier_reward/group_std_mean": 0.1125416949391365, "signal/brier_reward/group_zero_std_frac": 0.18611111044883727, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04007996916770935, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04007996916770935, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006727430503815412, "signal/confidence_one_or_zero/group_std_mean": 0.0019641853868961334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9888888835906983, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/format_reward/centered_abs_mean": 0.006971571315079928, "signal/format_reward/group_std_mean": 0.01620498076081276, "signal/format_reward/group_zero_std_frac": 0.9222222447395325, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.003485785657539964, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.003485785657539964, "signal/mean_confidence_reward/centered_abs_mean": 0.0635561004281044, "signal/mean_confidence_reward/group_std_mean": 0.08486722409725189, "signal/mean_confidence_reward/group_zero_std_frac": 0.2194444477558136, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.355609912134241e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.355609912134241e-07, "step": 580 }, { "calibration/aurc": 0.19072638284676796, "calibration/batch_distribution_entropy": 0.7153298707020193, "calibration/confidence_entropy": 0.388062457595125, "calibration/coverage@0%": 0.0031331592689295036, "calibration/coverage@1%": 0.13334149260226283, "calibration/coverage@10%": 0.38560840948651004, "calibration/coverage@15%": 0.4398634682332463, "calibration/coverage@20%": 0.4628168516100957, "calibration/coverage@25%": 0.5081783072236727, "calibration/coverage@30%": 0.7239528938207137, "calibration/coverage@5%": 0.36737924281984335, "calibration/ece": 0.16538362162750217, "calibration/mean_confidence": 0.7703153829416884, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005381944444444442, "completions/max_length": 3956.2, "completions/max_terminated_length": 3956.2, "completions/mean_length": 1070.244970703125, "completions/mean_terminated_length": 1076.0716552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 1.405594930063374, "grad_norm": 0.00036830996396020055, "learning_rate": 2.7343750000000004e-06, "loss": -0.0037, "num_tokens": 1474722083.0, "reward": 1.2578930139541626, "reward_std": 0.13912229090929032, "rewards/accuracy_reward": 0.6855902791023254, "rewards/brier_reward": 0.8355623245239258, "rewards/confidence_one_or_zero": 0.000260416668606922, "rewards/format_reward": 0.994618046283722, "rewards/mean_confidence_reward": 0.7711336612701416, "signal/accuracy_reward/centered_abs_mean": 0.1353841170668602, "signal/accuracy_reward/group_std_mean": 0.1797961950302124, "signal/accuracy_reward/group_zero_std_frac": 0.4833333373069763, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0676920585334301, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0676920585334301, "signal/advantage_abs_mean": 0.1005555510520935, "signal/advantage_pre_scale_abs_mean": 0.1005555510520935, "signal/advantage_pre_scale_std": 0.1904763787984848, "signal/advantage_std": 0.1904763787984848, "signal/brier_reward/centered_abs_mean": 0.09052570164203644, "signal/brier_reward/group_std_mean": 0.12332021445035934, "signal/brier_reward/group_zero_std_frac": 0.1972222238779068, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04526285082101822, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04526285082101822, "signal/confidence_one_or_zero/centered_abs_mean": 0.0005045572877861559, "signal/confidence_one_or_zero/group_std_mean": 0.0014731390401721, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 5.0455724931453e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 5.0455724931453e-09, "signal/format_reward/centered_abs_mean": 0.00926649325992912, "signal/format_reward/group_std_mean": 0.018176876381039618, "signal/format_reward/group_zero_std_frac": 0.9222222208976746, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00463324662996456, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00463324662996456, "signal/mean_confidence_reward/centered_abs_mean": 0.0691556192934513, "signal/mean_confidence_reward/group_std_mean": 0.09173129945993423, "signal/mean_confidence_reward/group_zero_std_frac": 0.20833333432674409, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.91556203946675e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.91556203946675e-07, "step": 585 }, { "calibration/aurc": 0.07597801419650824, "calibration/batch_distribution_entropy": 0.6147919112981574, "calibration/confidence_entropy": 0.35561690406075736, "calibration/coverage@0%": 0.11145833333333333, "calibration/coverage@1%": 0.24561167319408178, "calibration/coverage@10%": 0.6051920147954742, "calibration/coverage@15%": 0.853125, "calibration/coverage@20%": 0.9270833333333334, "calibration/coverage@25%": 0.9744791666666668, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.5613848999129678, "calibration/ece": 0.08992052872062667, "calibration/mean_confidence": 0.7937222856832027, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0019097222222222321, "completions/max_length": 3789.8, "completions/max_terminated_length": 3789.8, "completions/mean_length": 980.2046020507812, "completions/mean_terminated_length": 982.120654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 272.6, "epoch": 1.417594780065249, "grad_norm": 0.000314175384119153, "learning_rate": 2.7043269230769233e-06, "loss": -0.0007, "num_tokens": 1489137496.0, "reward": 1.2709884881973266, "reward_std": 0.10083374679088593, "rewards/accuracy_reward": 0.7010416626930237, "rewards/brier_reward": 0.8428294658660889, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9980902671813965, "rewards/mean_confidence_reward": 0.7752277255058289, "signal/accuracy_reward/centered_abs_mean": 0.09595268964767456, "signal/accuracy_reward/group_std_mean": 0.1302931472659111, "signal/accuracy_reward/group_zero_std_frac": 0.6166666626930237, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04797634482383728, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04797634482383728, "signal/advantage_abs_mean": 0.0731815330684185, "signal/advantage_pre_scale_abs_mean": 0.0731815330684185, "signal/advantage_pre_scale_std": 0.15592477917671205, "signal/advantage_std": 0.15592477917671205, "signal/brier_reward/centered_abs_mean": 0.07182166054844856, "signal/brier_reward/group_std_mean": 0.0960231676697731, "signal/brier_reward/group_zero_std_frac": 0.2500000029802322, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03591083027422428, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03591083027422428, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.003526475699618459, "signal/format_reward/group_std_mean": 0.007825178280472755, "signal/format_reward/group_zero_std_frac": 0.9638888835906982, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0017632378498092295, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0017632378498092295, "signal/mean_confidence_reward/centered_abs_mean": 0.05661746188998222, "signal/mean_confidence_reward/group_std_mean": 0.07510910034179688, "signal/mean_confidence_reward/group_zero_std_frac": 0.2694444447755814, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.661746286023117e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.661746286023117e-07, "step": 590 }, { "calibration/aurc": 0.09888623628012375, "calibration/batch_distribution_entropy": 0.7111550322017844, "calibration/confidence_entropy": 0.38815687724442066, "calibration/coverage@0%": 0.084375, "calibration/coverage@1%": 0.20801920147954744, "calibration/coverage@10%": 0.5400633703220191, "calibration/coverage@15%": 0.8164157940663175, "calibration/coverage@20%": 0.8685700261780104, "calibration/coverage@25%": 0.9155950043630018, "calibration/coverage@30%": 0.9578125, "calibration/coverage@5%": 0.37801620974760664, "calibration/ece": 0.12361557689876017, "calibration/mean_confidence": 0.7504816565252507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001996527777777768, "completions/max_length": 3594.4, "completions/max_terminated_length": 3594.4, "completions/mean_length": 978.8388916015625, "completions/mean_terminated_length": 980.8303833007812, "completions/min_length": 0.0, "completions/min_terminated_length": 274.4, "epoch": 1.429594630067124, "grad_norm": 0.0003591340209823102, "learning_rate": 2.6742788461538467e-06, "loss": -0.0007, "num_tokens": 1503510936.0, "reward": 1.2893666982650758, "reward_std": 0.10826914757490158, "rewards/accuracy_reward": 0.7210069537162781, "rewards/brier_reward": 0.8597081065177917, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9980034708976746, "rewards/mean_confidence_reward": 0.7405732035636902, "signal/accuracy_reward/centered_abs_mean": 0.10771484225988388, "signal/accuracy_reward/group_std_mean": 0.1526619553565979, "signal/accuracy_reward/group_zero_std_frac": 0.5305555760860443, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05385742112994194, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05385742112994194, "signal/advantage_abs_mean": 0.07624872028827667, "signal/advantage_pre_scale_abs_mean": 0.07624872028827667, "signal/advantage_pre_scale_std": 0.15728772878646852, "signal/advantage_std": 0.15728772878646852, "signal/brier_reward/centered_abs_mean": 0.0736357480287552, "signal/brier_reward/group_std_mean": 0.09937446862459183, "signal/brier_reward/group_zero_std_frac": 0.18888889253139496, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0368178740143776, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0368178740143776, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003255208255723119, "signal/confidence_one_or_zero/group_std_mean": 0.0006831518840044737, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.255208369523643e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.255208369523643e-09, "signal/format_reward/centered_abs_mean": 0.003738064179196954, "signal/format_reward/group_std_mean": 0.008715906366705894, "signal/format_reward/group_zero_std_frac": 0.9583333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001869032089598477, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001869032089598477, "signal/mean_confidence_reward/centered_abs_mean": 0.06295873150229454, "signal/mean_confidence_reward/group_std_mean": 0.08315386176109314, "signal/mean_confidence_reward/group_zero_std_frac": 0.19444444477558137, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.295872935879743e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.295872935879743e-07, "step": 595 }, { "calibration/aurc": 0.09223223934458172, "calibration/batch_distribution_entropy": 0.6230658684028573, "calibration/confidence_entropy": 0.3590459996013345, "calibration/coverage@0%": 0.19503372497824195, "calibration/coverage@1%": 0.3367003916449086, "calibration/coverage@10%": 0.6493526979982593, "calibration/coverage@15%": 0.780725087032202, "calibration/coverage@20%": 0.8, "calibration/coverage@25%": 0.8, "calibration/coverage@30%": 0.8776041666666667, "calibration/coverage@5%": 0.6227602806788511, "calibration/ece": 0.13438037151871202, "calibration/mean_confidence": 0.8042409840078328, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011284722222222542, "completions/max_length": 3431.8, "completions/max_terminated_length": 3431.8, "completions/mean_length": 974.4915893554687, "completions/mean_terminated_length": 975.5936279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 290.4, "epoch": 1.441594480068999, "grad_norm": 0.00034910583053715527, "learning_rate": 2.6442307692307696e-06, "loss": 0.0005, "num_tokens": 1517831863.0, "reward": 1.3226205348968505, "reward_std": 0.08774760514497756, "rewards/accuracy_reward": 0.7693576455116272, "rewards/brier_reward": 0.8769970893859863, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9988715291023255, "rewards/mean_confidence_reward": 0.7460156440734863, "signal/accuracy_reward/centered_abs_mean": 0.09129231721162796, "signal/accuracy_reward/group_std_mean": 0.12743524760007857, "signal/accuracy_reward/group_zero_std_frac": 0.6083333373069764, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04564615860581398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04564615860581398, "signal/advantage_abs_mean": 0.0622914120554924, "signal/advantage_pre_scale_abs_mean": 0.0622914120554924, "signal/advantage_pre_scale_std": 0.13646405190229416, "signal/advantage_std": 0.13646405190229416, "signal/brier_reward/centered_abs_mean": 0.06282149329781532, "signal/brier_reward/group_std_mean": 0.0858630046248436, "signal/brier_reward/group_zero_std_frac": 0.1722222238779068, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03141074664890766, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03141074664890766, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0021321614389307798, "signal/format_reward/group_std_mean": 0.005135205760598183, "signal/format_reward/group_zero_std_frac": 0.9749999880790711, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0010660807194653899, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0010660807194653899, "signal/mean_confidence_reward/centered_abs_mean": 0.059941692650318144, "signal/mean_confidence_reward/group_std_mean": 0.07988529056310653, "signal/mean_confidence_reward/group_zero_std_frac": 0.17777777910232545, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.994169555378903e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.994169555378903e-07, "step": 600 }, { "epoch": 1.441594480068999, "eval_calibration/aurc": 0.10451276394134634, "eval_calibration/batch_distribution_entropy": 0.6720469752138057, "eval_calibration/confidence_entropy": 0.3814216470331801, "eval_calibration/coverage@0%": 0.3020833333333333, "eval_calibration/coverage@1%": 0.3020833333333333, "eval_calibration/coverage@10%": 0.578125, "eval_calibration/coverage@15%": 0.7239583333333334, "eval_calibration/coverage@20%": 0.8072916666666666, "eval_calibration/coverage@25%": 0.8854166666666666, "eval_calibration/coverage@30%": 0.9583333333333334, "eval_calibration/coverage@5%": 0.375, "eval_calibration/ece": 0.15234375, "eval_calibration/mean_confidence": 0.7356770833333334, "eval_completions/clipped_ratio": 0.0008680555555555617, "eval_completions/max_length": 2499.6666666666665, "eval_completions/max_terminated_length": 2499.6666666666665, "eval_completions/mean_length": 959.6630452473959, "eval_completions/mean_terminated_length": 960.4922688802084, "eval_completions/min_length": 263.1666666666667, "eval_completions/min_terminated_length": 302.8333333333333, "eval_loss": 0.0, "eval_num_tokens": 1517831863.0, "eval_reward": 1.282625397046407, "eval_reward_std": 0.3021497180064519, "eval_rewards/accuracy_reward": 0.7126736144224802, "eval_rewards/brier_reward": 0.8542985022068024, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9982638855775198, "eval_rewards/mean_confidence_reward": 0.7335937718550364, "eval_runtime": 147.4267, "eval_samples_per_second": 6.783, "eval_signal/accuracy_reward/centered_abs_mean": 0.3942599842945735, "eval_signal/accuracy_reward/group_std_mean": 0.44937459131081897, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19712999214728674, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.19712999214728674, "eval_signal/advantage_abs_mean": 0.2553720424572627, "eval_signal/advantage_pre_scale_abs_mean": 0.2553720424572627, "eval_signal/advantage_pre_scale_std": 0.3003276487191518, "eval_signal/advantage_std": 0.3003276487191518, "eval_signal/brier_reward/centered_abs_mean": 0.17614439874887466, "eval_signal/brier_reward/group_std_mean": 0.23443047205607095, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08807219937443733, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.08807219937443733, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.0033637151742974916, "eval_signal/format_reward/group_std_mean": 0.009820927555362383, "eval_signal/format_reward/group_zero_std_frac": 0.9444444477558136, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.23174425711234412, "eval_signal/mean_confidence_reward/group_std_mean": 0.2733881175518036, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.317442560221631e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.317442560221631e-06, "eval_steps_per_second": 0.041, "step": 600 }, { "epoch": 1.441594480068999, "step": 600, "train_probe_calibration/aurc": 0.11176829289641253, "train_probe_calibration/batch_distribution_entropy": 0.7015190869717917, "train_probe_calibration/confidence_entropy": 0.40564868456039765, "train_probe_calibration/coverage@0%": 0.22395833333333334, "train_probe_calibration/coverage@1%": 0.22395833333333334, "train_probe_calibration/coverage@10%": 0.5416666666666666, "train_probe_calibration/coverage@15%": 0.7083333333333334, "train_probe_calibration/coverage@20%": 0.84375, "train_probe_calibration/coverage@25%": 0.9322916666666666, "train_probe_calibration/coverage@30%": 0.9479166666666666, "train_probe_calibration/coverage@5%": 0.23958333333333334, "train_probe_calibration/ece": 0.1278645833333333, "train_probe_calibration/mean_confidence": 0.73203125, "train_probe_completions/clipped_ratio": 0.002604166666666685, "train_probe_completions/max_length": 3048.5, "train_probe_completions/max_terminated_length": 3048.5, "train_probe_completions/mean_length": 968.863047281901, "train_probe_completions/mean_terminated_length": 971.4267171223959, "train_probe_completions/min_length": 149.0, "train_probe_completions/min_terminated_length": 301.8333333333333, "train_probe_loss": 0.0, "train_probe_num_tokens": 1517831863.0, "train_probe_reward": 1.3095930020014446, "train_probe_reward_std": 0.2849563310543696, "train_probe_rewards/accuracy_reward": 0.75, "train_probe_rewards/brier_reward": 0.8717751403649648, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9973958432674408, "train_probe_rewards/mean_confidence_reward": 0.7434924840927124, "train_probe_runtime": 175.9645, "train_probe_samples_per_second": 5.683, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3615451405445735, "train_probe_signal/accuracy_reward/group_std_mean": 0.4288966655731201, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.18077257027228674, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.18077257027228674, "train_probe_signal/advantage_abs_mean": 0.23442783951759338, "train_probe_signal/advantage_pre_scale_abs_mean": 0.23442783951759338, "train_probe_signal/advantage_pre_scale_std": 0.28571516275405884, "train_probe_signal/advantage_std": 0.28571516275405884, "train_probe_signal/brier_reward/centered_abs_mean": 0.15385619054238, "train_probe_signal/brier_reward/group_std_mean": 0.21183102329572043, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07692809527119, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07692809527119, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.0050455727614462376, "train_probe_signal/format_reward/group_std_mean": 0.014731391333043575, "train_probe_signal/format_reward/group_zero_std_frac": 0.9166666865348816, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0025227863807231188, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.0025227863807231188, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.22009947150945663, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.2642533779144287, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.200994648167883e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.200994648167883e-06, "train_probe_steps_per_second": 0.034 }, { "calibration/aurc": 0.11232556419082203, "calibration/batch_distribution_entropy": 0.6128072063830443, "calibration/confidence_entropy": 0.35467244447989643, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.3577464782715678, "calibration/coverage@15%": 0.7843877942463057, "calibration/coverage@20%": 0.8925354721724787, "calibration/coverage@25%": 0.9113645017406441, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.3294851344670303, "calibration/ece": 0.0887839054646882, "calibration/mean_confidence": 0.7794962359854232, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0019097222222222321, "completions/max_length": 3480.6, "completions/max_terminated_length": 3480.6, "completions/mean_length": 963.3787353515625, "completions/mean_terminated_length": 965.2444091796875, "completions/min_length": 0.0, "completions/min_terminated_length": 288.4, "epoch": 1.453594330070874, "grad_norm": 0.00041539620724506676, "learning_rate": 2.6141826923076926e-06, "loss": -0.0003, "num_tokens": 1532028770.0, "reward": 1.290432858467102, "reward_std": 0.11487889736890793, "rewards/accuracy_reward": 0.7289930582046509, "rewards/brier_reward": 0.8537674069404602, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9980902910232544, "rewards/mean_confidence_reward": 0.7485014081001282, "signal/accuracy_reward/centered_abs_mean": 0.11991102546453476, "signal/accuracy_reward/group_std_mean": 0.16140546798706054, "signal/accuracy_reward/group_zero_std_frac": 0.5277777850627899, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05995551273226738, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05995551273226738, "signal/advantage_abs_mean": 0.08384716510772705, "signal/advantage_pre_scale_abs_mean": 0.08384716510772705, "signal/advantage_pre_scale_std": 0.16693741977214813, "signal/advantage_std": 0.16693741977214813, "signal/brier_reward/centered_abs_mean": 0.08437594920396804, "signal/brier_reward/group_std_mean": 0.11260297149419785, "signal/brier_reward/group_zero_std_frac": 0.22777777910232544, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04218797460198402, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04218797460198402, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0034830728778615593, "signal/format_reward/group_std_mean": 0.007145930593833327, "signal/format_reward/group_zero_std_frac": 0.9694444417953492, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0017415364389307797, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0017415364389307797, "signal/mean_confidence_reward/centered_abs_mean": 0.0689037449657917, "signal/mean_confidence_reward/group_std_mean": 0.09153866320848465, "signal/mean_confidence_reward/group_zero_std_frac": 0.24166666865348815, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.890374152135337e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.890374152135337e-07, "step": 605 }, { "calibration/aurc": 0.14914184118403714, "calibration/batch_distribution_entropy": 0.5667219257916708, "calibration/confidence_entropy": 0.3409189561364359, "calibration/coverage@0%": 0.14010416666666667, "calibration/coverage@1%": 0.1703125, "calibration/coverage@10%": 0.4489357636278303, "calibration/coverage@15%": 0.47457930638524737, "calibration/coverage@20%": 0.5414678823834975, "calibration/coverage@25%": 0.7586756523997649, "calibration/coverage@30%": 0.9671875, "calibration/coverage@5%": 0.3011234729493892, "calibration/ece": 0.12172503847529606, "calibration/mean_confidence": 0.8178183696897825, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001041666666666674, "completions/max_length": 3556.6, "completions/max_terminated_length": 3556.6, "completions/mean_length": 972.8282104492188, "completions/mean_terminated_length": 973.87724609375, "completions/min_length": 0.0, "completions/min_terminated_length": 277.2, "epoch": 1.465594180072749, "grad_norm": 0.000401933619286865, "learning_rate": 2.584134615384616e-06, "loss": 0.0002, "num_tokens": 1546364743.0, "reward": 1.2915134191513062, "reward_std": 0.10282141864299774, "rewards/accuracy_reward": 0.7364583373069763, "rewards/brier_reward": 0.8477681636810303, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9987847208976746, "rewards/mean_confidence_reward": 0.7801776647567749, "signal/accuracy_reward/centered_abs_mean": 0.09590928703546524, "signal/accuracy_reward/group_std_mean": 0.13645849376916885, "signal/accuracy_reward/group_zero_std_frac": 0.569444453716278, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04795464351773262, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04795464351773262, "signal/advantage_abs_mean": 0.07137535512447357, "signal/advantage_pre_scale_abs_mean": 0.07137535512447357, "signal/advantage_pre_scale_std": 0.15250917971134187, "signal/advantage_std": 0.15250917971134187, "signal/brier_reward/centered_abs_mean": 0.07281295061111451, "signal/brier_reward/group_std_mean": 0.10010344237089157, "signal/brier_reward/group_zero_std_frac": 0.30555556416511537, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036406475305557254, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036406475305557254, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003255208255723119, "signal/confidence_one_or_zero/group_std_mean": 0.0006831518840044737, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.2552080142522754e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.2552080142522754e-09, "signal/format_reward/centered_abs_mean": 0.002332899277098477, "signal/format_reward/group_std_mean": 0.006276767514646053, "signal/format_reward/group_zero_std_frac": 0.9666666746139526, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0011664496385492384, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0011664496385492384, "signal/mean_confidence_reward/centered_abs_mean": 0.06345794722437859, "signal/mean_confidence_reward/group_std_mean": 0.08363351076841355, "signal/mean_confidence_reward/group_zero_std_frac": 0.3361111134290695, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.345794304252195e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.345794304252195e-07, "step": 610 }, { "calibration/aurc": 0.1674503906225145, "calibration/batch_distribution_entropy": 0.48562553932022945, "calibration/confidence_entropy": 0.3086111717467877, "calibration/coverage@0%": 0.00575370855148342, "calibration/coverage@1%": 0.00575370855148342, "calibration/coverage@10%": 0.16648669284467713, "calibration/coverage@15%": 0.6439926919720768, "calibration/coverage@20%": 0.7097540357766142, "calibration/coverage@25%": 0.7738847076788831, "calibration/coverage@30%": 0.9302083333333332, "calibration/coverage@5%": 0.00575370855148342, "calibration/ece": 0.1198246073298429, "calibration/mean_confidence": 0.8333796356893541, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0008680555555555802, "completions/max_length": 3317.8, "completions/max_terminated_length": 3317.8, "completions/mean_length": 950.80009765625, "completions/mean_terminated_length": 951.632421875, "completions/min_length": 53.4, "completions/min_terminated_length": 283.4, "epoch": 1.477594030074624, "grad_norm": 0.00043170707067474723, "learning_rate": 2.554086538461539e-06, "loss": 0.0003, "num_tokens": 1560397544.0, "reward": 1.2985013246536254, "reward_std": 0.1115952417254448, "rewards/accuracy_reward": 0.7434027791023254, "rewards/brier_reward": 0.8544514536857605, "rewards/confidence_one_or_zero": 0.0003472222248092294, "rewards/format_reward": 0.9991319417953491, "rewards/mean_confidence_reward": 0.8247465014457702, "signal/accuracy_reward/centered_abs_mean": 0.1074001744389534, "signal/accuracy_reward/group_std_mean": 0.14306251108646392, "signal/accuracy_reward/group_zero_std_frac": 0.5861111104488372, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0537000872194767, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0537000872194767, "signal/advantage_abs_mean": 0.08258906453847885, "signal/advantage_pre_scale_abs_mean": 0.08258906453847885, "signal/advantage_pre_scale_std": 0.17162491977214814, "signal/advantage_std": 0.17162491977214814, "signal/brier_reward/centered_abs_mean": 0.07824950367212295, "signal/brier_reward/group_std_mean": 0.1050735890865326, "signal/brier_reward/group_zero_std_frac": 0.3222222238779068, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.039124751836061476, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.039124751836061476, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006076388992369175, "signal/confidence_one_or_zero/group_std_mean": 0.0009333631955087185, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.076388103792851e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.076388103792851e-09, "signal/format_reward/centered_abs_mean": 0.0016493055038154126, "signal/format_reward/group_std_mean": 0.004013641132041812, "signal/format_reward/group_zero_std_frac": 0.9805555462837219, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0008246527519077063, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0008246527519077063, "signal/mean_confidence_reward/centered_abs_mean": 0.05639765039086342, "signal/mean_confidence_reward/group_std_mean": 0.0743689127266407, "signal/mean_confidence_reward/group_zero_std_frac": 0.3583333432674408, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.639764879106224e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.639764879106224e-07, "step": 615 }, { "calibration/aurc": 0.09557332654028364, "calibration/batch_distribution_entropy": 0.4765025630816969, "calibration/confidence_entropy": 0.3034517021504529, "calibration/coverage@0%": 0.006773560209424083, "calibration/coverage@1%": 0.006773560209424083, "calibration/coverage@10%": 0.6819808027923211, "calibration/coverage@15%": 0.7539621509598604, "calibration/coverage@20%": 0.9316563045375219, "calibration/coverage@25%": 0.9802083333333332, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.17344022687609076, "calibration/ece": 0.08130260143979064, "calibration/mean_confidence": 0.844120609729494, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001649305555555558, "completions/max_length": 3412.8, "completions/max_terminated_length": 3412.8, "completions/mean_length": 918.356787109375, "completions/mean_terminated_length": 919.87001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 1.489593880076499, "grad_norm": 0.0004489246057346463, "learning_rate": 2.5240384615384618e-06, "loss": -0.0012, "num_tokens": 1574082710.0, "reward": 1.3020979642868042, "reward_std": 0.11776087433099747, "rewards/accuracy_reward": 0.7558159828186035, "rewards/brier_reward": 0.8500124335289001, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9983506917953491, "rewards/mean_confidence_reward": 0.844358229637146, "signal/accuracy_reward/centered_abs_mean": 0.1106174036860466, "signal/accuracy_reward/group_std_mean": 0.1486045241355896, "signal/accuracy_reward/group_zero_std_frac": 0.569444453716278, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0553087018430233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0553087018430233, "signal/advantage_abs_mean": 0.08657139092683792, "signal/advantage_pre_scale_abs_mean": 0.08657139092683792, "signal/advantage_pre_scale_std": 0.18212653696537018, "signal/advantage_std": 0.18212653696537018, "signal/brier_reward/centered_abs_mean": 0.08275578171014786, "signal/brier_reward/group_std_mean": 0.10967738628387451, "signal/brier_reward/group_zero_std_frac": 0.34166666865348816, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04137789085507393, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04137789085507393, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0030870226095430554, "signal/format_reward/group_std_mean": 0.006833086814731359, "signal/format_reward/group_zero_std_frac": 0.9694444298744201, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0015435113047715277, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0015435113047715277, "signal/mean_confidence_reward/centered_abs_mean": 0.050745067000389096, "signal/mean_confidence_reward/group_std_mean": 0.06767487451434136, "signal/mean_confidence_reward/group_zero_std_frac": 0.375, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.074506532309897e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.074506532309897e-07, "step": 620 }, { "calibration/aurc": 0.137915592409067, "calibration/batch_distribution_entropy": 0.49302740079844326, "calibration/confidence_entropy": 0.3139330907325345, "calibration/coverage@0%": 0.004698393264801171, "calibration/coverage@1%": 0.004698393264801171, "calibration/coverage@10%": 0.3471776690065115, "calibration/coverage@15%": 0.5800910634218692, "calibration/coverage@20%": 0.7249263601059879, "calibration/coverage@25%": 0.877351229329852, "calibration/coverage@30%": 0.941514360313316, "calibration/coverage@5%": 0.31793192293321304, "calibration/ece": 0.12001989337985686, "calibration/mean_confidence": 0.8215884130122255, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001128472222222232, "completions/max_length": 3232.4, "completions/max_terminated_length": 3232.4, "completions/mean_length": 918.64775390625, "completions/mean_terminated_length": 919.7128784179688, "completions/min_length": 0.0, "completions/min_terminated_length": 229.2, "epoch": 1.501593730078374, "grad_norm": 0.0005610015941783786, "learning_rate": 2.4939903846153847e-06, "loss": 0.0003, "num_tokens": 1587761916.0, "reward": 1.309776735305786, "reward_std": 0.12352517545223236, "rewards/accuracy_reward": 0.7489583373069764, "rewards/brier_reward": 0.8717072010040283, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9988715171813964, "rewards/mean_confidence_reward": 0.8192390203475952, "signal/accuracy_reward/centered_abs_mean": 0.1128363698720932, "signal/accuracy_reward/group_std_mean": 0.1548069804906845, "signal/accuracy_reward/group_zero_std_frac": 0.5388888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0564181849360466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0564181849360466, "signal/advantage_abs_mean": 0.087917160987854, "signal/advantage_pre_scale_abs_mean": 0.087917160987854, "signal/advantage_pre_scale_std": 0.1812780976295471, "signal/advantage_std": 0.1812780976295471, "signal/brier_reward/centered_abs_mean": 0.08223155289888381, "signal/brier_reward/group_std_mean": 0.1142003670334816, "signal/brier_reward/group_zero_std_frac": 0.33055556416511533, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04111577644944191, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04111577644944191, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0021321614272892474, "signal/format_reward/group_std_mean": 0.0051352059002965685, "signal/format_reward/group_zero_std_frac": 0.975, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0010660807136446237, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0010660807136446237, "signal/mean_confidence_reward/centered_abs_mean": 0.053113528341054914, "signal/mean_confidence_reward/group_std_mean": 0.07214976102113724, "signal/mean_confidence_reward/group_zero_std_frac": 0.3805555641651154, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.311352822445769e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.311352822445769e-07, "step": 625 }, { "calibration/aurc": 0.16120052839047785, "calibration/batch_distribution_entropy": 0.6061042013479356, "calibration/confidence_entropy": 0.3563380805583091, "calibration/coverage@0%": 0.0005208333333333333, "calibration/coverage@1%": 0.0005208333333333333, "calibration/coverage@10%": 0.2223958333333333, "calibration/coverage@15%": 0.4484375, "calibration/coverage@20%": 0.7281249999999999, "calibration/coverage@25%": 0.8494791666666666, "calibration/coverage@30%": 0.9348958333333333, "calibration/coverage@5%": 0.084375, "calibration/ece": 0.13630208333333332, "calibration/mean_confidence": 0.7799270833333333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0004340277777777901, "completions/max_length": 3547.8, "completions/max_terminated_length": 3547.8, "completions/mean_length": 878.0661499023438, "completions/mean_terminated_length": 878.4486450195312, "completions/min_length": 99.0, "completions/min_terminated_length": 257.2, "epoch": 1.513593580080249, "grad_norm": 0.0004561617679428309, "learning_rate": 2.463942307692308e-06, "loss": 0.0011, "num_tokens": 1600978070.0, "reward": 1.3056865453720092, "reward_std": 0.10008805692195892, "rewards/accuracy_reward": 0.7478298664093017, "rewards/brier_reward": 0.863961136341095, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9995659589767456, "rewards/mean_confidence_reward": 0.8043061256408691, "signal/accuracy_reward/centered_abs_mean": 0.0969021275639534, "signal/accuracy_reward/group_std_mean": 0.13032149970531465, "signal/accuracy_reward/group_zero_std_frac": 0.6222222208976745, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0484510637819767, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0484510637819767, "signal/advantage_abs_mean": 0.0733338326215744, "signal/advantage_pre_scale_abs_mean": 0.0733338326215744, "signal/advantage_pre_scale_std": 0.16065391898155212, "signal/advantage_std": 0.16065391898155212, "signal/brier_reward/centered_abs_mean": 0.07474518865346909, "signal/brier_reward/group_std_mean": 0.09937786757946014, "signal/brier_reward/group_zero_std_frac": 0.32777778506278993, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.037372594326734544, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.037372594326734544, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0008300781133584678, "signal/format_reward/group_std_mean": 0.0021562909241765738, "signal/format_reward/group_zero_std_frac": 0.9888888835906983, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0004150390566792339, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0004150390566792339, "signal/mean_confidence_reward/centered_abs_mean": 0.05449062511324883, "signal/mean_confidence_reward/group_std_mean": 0.07224412411451339, "signal/mean_confidence_reward/group_zero_std_frac": 0.35833333134651185, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.449062200568733e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.449062200568733e-07, "step": 630 }, { "calibration/aurc": 0.10439632054463635, "calibration/batch_distribution_entropy": 0.6704199273165201, "calibration/confidence_entropy": 0.3766526588101127, "calibration/coverage@0%": 0.1109375, "calibration/coverage@1%": 0.1109375, "calibration/coverage@10%": 0.6380590095986038, "calibration/coverage@15%": 0.7047665794066318, "calibration/coverage@20%": 0.8396924083769634, "calibration/coverage@25%": 0.9101794284467715, "calibration/coverage@30%": 0.9676238001745201, "calibration/coverage@5%": 0.3291175828970332, "calibration/ece": 0.08334786758289704, "calibration/mean_confidence": 0.7575420756980801, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001736111111111116, "completions/max_length": 3452.2, "completions/max_terminated_length": 3452.2, "completions/mean_length": 942.543505859375, "completions/mean_terminated_length": 944.2030883789063, "completions/min_length": 105.4, "completions/min_terminated_length": 263.4, "epoch": 1.525593430082124, "grad_norm": 0.0004556447092909366, "learning_rate": 2.433894230769231e-06, "loss": 0.0002, "num_tokens": 1614961131.0, "reward": 1.2861794471740722, "reward_std": 0.12219316810369492, "rewards/accuracy_reward": 0.7238715410232544, "rewards/brier_reward": 0.8502085566520691, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9982638835906983, "rewards/mean_confidence_reward": 0.7423978567123413, "signal/accuracy_reward/centered_abs_mean": 0.12494032233953475, "signal/accuracy_reward/group_std_mean": 0.1702763855457306, "signal/accuracy_reward/group_zero_std_frac": 0.4944444477558136, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06247016116976738, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06247016116976738, "signal/advantage_abs_mean": 0.09004508405923843, "signal/advantage_pre_scale_abs_mean": 0.09004508405923843, "signal/advantage_pre_scale_std": 0.1708286613225937, "signal/advantage_std": 0.1708286613225937, "signal/brier_reward/centered_abs_mean": 0.09039755314588546, "signal/brier_reward/group_std_mean": 0.11918909549713134, "signal/brier_reward/group_zero_std_frac": 0.2, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.04519877657294273, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.04519877657294273, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.003081597248092294, "signal/format_reward/group_std_mean": 0.0056969795376062395, "signal/format_reward/group_zero_std_frac": 0.9777777791023254, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001540798624046147, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001540798624046147, "signal/mean_confidence_reward/centered_abs_mean": 0.07954908609390259, "signal/mean_confidence_reward/group_std_mean": 0.10289034098386765, "signal/mean_confidence_reward/group_zero_std_frac": 0.21666666865348816, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 7.954908369356418e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 7.954908369356418e-07, "step": 635 }, { "calibration/aurc": 0.12127524100223024, "calibration/batch_distribution_entropy": 0.7103595338182772, "calibration/confidence_entropy": 0.39015717023097285, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.11832460732984293, "calibration/coverage@10%": 0.5184227748691099, "calibration/coverage@15%": 0.5872218586387434, "calibration/coverage@20%": 0.7231893542757417, "calibration/coverage@25%": 0.8232820680628272, "calibration/coverage@30%": 0.9078125, "calibration/coverage@5%": 0.4178773996509599, "calibration/ece": 0.11806672665794068, "calibration/mean_confidence": 0.7271903086823734, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001388888888888884, "completions/max_length": 3763.4, "completions/max_terminated_length": 3763.4, "completions/mean_length": 947.674658203125, "completions/mean_terminated_length": 949.0006958007813, "completions/min_length": 45.6, "completions/min_terminated_length": 244.4, "epoch": 1.5375932800839989, "grad_norm": 0.00047028163680806756, "learning_rate": 2.403846153846154e-06, "loss": 0.0007, "num_tokens": 1628955207.0, "reward": 1.2948758125305175, "reward_std": 0.09973684251308441, "rewards/accuracy_reward": 0.7269097089767456, "rewards/brier_reward": 0.8642160773277283, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986111044883728, "rewards/mean_confidence_reward": 0.7325807094573975, "signal/accuracy_reward/centered_abs_mean": 0.10819227397441863, "signal/accuracy_reward/group_std_mean": 0.14667461216449737, "signal/accuracy_reward/group_zero_std_frac": 0.5638889074325562, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05409613698720932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05409613698720932, "signal/advantage_abs_mean": 0.07218273133039474, "signal/advantage_pre_scale_abs_mean": 0.07218273133039474, "signal/advantage_pre_scale_std": 0.1484358012676239, "signal/advantage_std": 0.1484358012676239, "signal/brier_reward/centered_abs_mean": 0.07523007243871689, "signal/brier_reward/group_std_mean": 0.10071428418159485, "signal/brier_reward/group_zero_std_frac": 0.2416666716337204, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.037615036219358446, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.037615036219358446, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0025716146104969085, "signal/format_reward/group_std_mean": 0.0055775225162506105, "signal/format_reward/group_zero_std_frac": 0.9749999880790711, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0012858073052484543, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0012858073052484543, "signal/mean_confidence_reward/centered_abs_mean": 0.0683393806219101, "signal/mean_confidence_reward/group_std_mean": 0.08972892165184021, "signal/mean_confidence_reward/group_zero_std_frac": 0.2527777761220932, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.83393784584041e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.83393784584041e-07, "step": 640 }, { "calibration/aurc": 0.11004002124506543, "calibration/batch_distribution_entropy": 0.6009934588435399, "calibration/confidence_entropy": 0.35327770639864353, "calibration/coverage@0%": 0.12135416666666668, "calibration/coverage@1%": 0.17604166666666668, "calibration/coverage@10%": 0.6166666666666667, "calibration/coverage@15%": 0.703125, "calibration/coverage@20%": 0.7557291666666667, "calibration/coverage@25%": 0.7885416666666667, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.3421875, "calibration/ece": 0.1197135416666667, "calibration/mean_confidence": 0.7921614583333333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009548611111111161, "completions/max_length": 3660.0, "completions/max_terminated_length": 3660.0, "completions/mean_length": 871.0027954101563, "completions/mean_terminated_length": 871.8511352539062, "completions/min_length": 53.0, "completions/min_terminated_length": 251.8, "epoch": 1.5495931300858738, "grad_norm": 0.0003979683096986264, "learning_rate": 2.373798076923077e-06, "loss": 0.0006, "num_tokens": 1642088199.0, "reward": 1.3260669231414794, "reward_std": 0.1002319410443306, "rewards/accuracy_reward": 0.7807291746139526, "rewards/brier_reward": 0.8723434329032898, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9990451335906982, "rewards/mean_confidence_reward": 0.7999942064285278, "signal/accuracy_reward/centered_abs_mean": 0.10078125149011612, "signal/accuracy_reward/group_std_mean": 0.1380949318408966, "signal/accuracy_reward/group_zero_std_frac": 0.5861111164093018, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05039062574505806, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05039062574505806, "signal/advantage_abs_mean": 0.07198782041668891, "signal/advantage_pre_scale_abs_mean": 0.07198782041668891, "signal/advantage_pre_scale_std": 0.15460035502910613, "signal/advantage_std": 0.15460035502910613, "signal/brier_reward/centered_abs_mean": 0.06760974749922752, "signal/brier_reward/group_std_mean": 0.09188247472047806, "signal/brier_reward/group_zero_std_frac": 0.27500000298023225, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03380487374961376, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03380487374961376, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0018283419776707888, "signal/format_reward/group_std_mean": 0.004803628101944924, "signal/format_reward/group_zero_std_frac": 0.975, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0009141709888353944, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0009141709888353944, "signal/mean_confidence_reward/centered_abs_mean": 0.060545888543128965, "signal/mean_confidence_reward/group_std_mean": 0.08132546544075012, "signal/mean_confidence_reward/group_zero_std_frac": 0.28333333432674407, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.054588766346569e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.054588766346569e-07, "step": 645 }, { "calibration/aurc": 0.22140432182266445, "calibration/batch_distribution_entropy": 0.643554486456017, "calibration/confidence_entropy": 0.3781377027903585, "calibration/coverage@0%": 0.034375, "calibration/coverage@1%": 0.034375, "calibration/coverage@10%": 0.05520833333333334, "calibration/coverage@15%": 0.109375, "calibration/coverage@20%": 0.3876767841601393, "calibration/coverage@25%": 0.7508322454308094, "calibration/coverage@30%": 0.7920855091383812, "calibration/coverage@5%": 0.034375, "calibration/ece": 0.16447961995938498, "calibration/mean_confidence": 0.7615519382796634, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001041666666666674, "completions/max_length": 3650.2, "completions/max_terminated_length": 3650.2, "completions/mean_length": 911.3776977539062, "completions/mean_terminated_length": 912.3047485351562, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 1.5615929800877488, "grad_norm": 0.0005324080120772123, "learning_rate": 2.3437500000000002e-06, "loss": -0.0004, "num_tokens": 1655682758.0, "reward": 1.290390968322754, "reward_std": 0.11039121299982071, "rewards/accuracy_reward": 0.7323784708976746, "rewards/brier_reward": 0.8495159983634949, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9988715171813964, "rewards/mean_confidence_reward": 0.7978532910346985, "signal/accuracy_reward/centered_abs_mean": 0.10956488698720931, "signal/accuracy_reward/group_std_mean": 0.1497401401400566, "signal/accuracy_reward/group_zero_std_frac": 0.5555555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05478244349360466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05478244349360466, "signal/advantage_abs_mean": 0.08013438433408737, "signal/advantage_pre_scale_abs_mean": 0.08013438433408737, "signal/advantage_pre_scale_std": 0.16512468755245208, "signal/advantage_std": 0.16512468755245208, "signal/brier_reward/centered_abs_mean": 0.07343227863311767, "signal/brier_reward/group_std_mean": 0.09887117743492127, "signal/brier_reward/group_zero_std_frac": 0.25833334028720856, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036716139316558837, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036716139316558837, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0021213107742369176, "signal/format_reward/group_std_mean": 0.005082572065293789, "signal/format_reward/group_zero_std_frac": 0.9749999880790711, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0010606553871184588, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0010606553871184588, "signal/mean_confidence_reward/centered_abs_mean": 0.05819051116704941, "signal/mean_confidence_reward/group_std_mean": 0.07811878025531768, "signal/mean_confidence_reward/group_zero_std_frac": 0.28333333432674407, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.819051011712873e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.819051011712873e-07, "step": 650 }, { "epoch": 1.5615929800877488, "eval_calibration/aurc": 0.12969456831486678, "eval_calibration/batch_distribution_entropy": 0.583502799492367, "eval_calibration/confidence_entropy": 0.3571954370842699, "eval_calibration/coverage@0%": 0.16666666666666666, "eval_calibration/coverage@1%": 0.16666666666666666, "eval_calibration/coverage@10%": 0.44590053763440857, "eval_calibration/coverage@15%": 0.6234879032258065, "eval_calibration/coverage@20%": 0.7642809139784946, "eval_calibration/coverage@25%": 0.8746639784946236, "eval_calibration/coverage@30%": 0.9322916666666666, "eval_calibration/coverage@5%": 0.1875, "eval_calibration/ece": 0.14947076612903223, "eval_calibration/mean_confidence": 0.7873571908602152, "eval_completions/clipped_ratio": 0.0017361111111111234, "eval_completions/max_length": 2769.6666666666665, "eval_completions/max_terminated_length": 2769.6666666666665, "eval_completions/mean_length": 917.9793701171875, "eval_completions/mean_terminated_length": 919.5607503255209, "eval_completions/min_length": 204.0, "eval_completions/min_terminated_length": 277.8333333333333, "eval_loss": 0.0, "eval_num_tokens": 1655682758.0, "eval_reward": 1.2870293656984966, "eval_reward_std": 0.3230198224385579, "eval_rewards/accuracy_reward": 0.7230902910232544, "eval_rewards/brier_reward": 0.8544248839219412, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.996527781089147, "eval_rewards/mean_confidence_reward": 0.785980890194575, "eval_runtime": 180.2669, "eval_samples_per_second": 5.547, "eval_signal/accuracy_reward/centered_abs_mean": 0.3888346403837204, "eval_signal/accuracy_reward/group_std_mean": 0.4460001389185588, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1944173201918602, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.1944173201918602, "eval_signal/advantage_abs_mean": 0.27036121984322864, "eval_signal/advantage_pre_scale_abs_mean": 0.27036121984322864, "eval_signal/advantage_pre_scale_std": 0.3217717260122299, "eval_signal/advantage_std": 0.3217717260122299, "eval_signal/brier_reward/centered_abs_mean": 0.18200468023618063, "eval_signal/brier_reward/group_std_mean": 0.2484159121910731, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09100234011809032, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.09100234011809032, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.006618923507630825, "eval_signal/format_reward/group_std_mean": 0.01665244624018669, "eval_signal/format_reward/group_zero_std_frac": 0.9166666865348816, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0033094617538154125, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0033094617538154125, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.1922797461350759, "eval_signal/mean_confidence_reward/group_std_mean": 0.23714462170998254, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.9227974045558463e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.9227974045558463e-06, "eval_steps_per_second": 0.033, "step": 650 }, { "epoch": 1.5615929800877488, "step": 650, "train_probe_calibration/aurc": 0.11233215280938669, "train_probe_calibration/batch_distribution_entropy": 0.6155186524530678, "train_probe_calibration/confidence_entropy": 0.37468261908883665, "train_probe_calibration/coverage@0%": 0.23958333333333334, "train_probe_calibration/coverage@1%": 0.23958333333333334, "train_probe_calibration/coverage@10%": 0.4739583333333333, "train_probe_calibration/coverage@15%": 0.7291666666666666, "train_probe_calibration/coverage@20%": 0.8541666666666666, "train_probe_calibration/coverage@25%": 0.9427083333333334, "train_probe_calibration/coverage@30%": 0.984375, "train_probe_calibration/coverage@5%": 0.3229166666666667, "train_probe_calibration/ece": 0.121875, "train_probe_calibration/mean_confidence": 0.7916666666666666, "train_probe_completions/clipped_ratio": 0.0008680555555555617, "train_probe_completions/max_length": 2825.3333333333335, "train_probe_completions/max_terminated_length": 2825.3333333333335, "train_probe_completions/mean_length": 940.2074890136719, "train_probe_completions/mean_terminated_length": 941.0739949544271, "train_probe_completions/min_length": 196.0, "train_probe_completions/min_terminated_length": 239.0, "train_probe_loss": 0.0, "train_probe_num_tokens": 1655682758.0, "train_probe_reward": 1.3190205693244934, "train_probe_reward_std": 0.2966742565234502, "train_probe_rewards/accuracy_reward": 0.7673611144224802, "train_probe_rewards/brier_reward": 0.8715321123600006, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9991319477558136, "train_probe_rewards/mean_confidence_reward": 0.7901591261227926, "train_probe_runtime": 156.331, "train_probe_samples_per_second": 6.397, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3495008697112401, "train_probe_signal/accuracy_reward/group_std_mean": 0.4226486037174861, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.17475043485562006, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.17475043485562006, "train_probe_signal/advantage_abs_mean": 0.2393521616856257, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2393521616856257, "train_probe_signal/advantage_pre_scale_std": 0.29541688164075214, "train_probe_signal/advantage_std": 0.29541688164075214, "train_probe_signal/brier_reward/centered_abs_mean": 0.1622385506828626, "train_probe_signal/brier_reward/group_std_mean": 0.2240768993894259, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0811192753414313, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.0811192753414313, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.0016818575871487458, "train_probe_signal/format_reward/group_std_mean": 0.0049104637776811915, "train_probe_signal/format_reward/group_zero_std_frac": 0.9722222288449606, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0008409287935743729, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.0008409287935743729, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.18782227983077368, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.23230999459822974, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.878222785004861e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.878222785004861e-06, "train_probe_steps_per_second": 0.038 }, { "calibration/aurc": 0.09526697738145444, "calibration/batch_distribution_entropy": 0.7055263057305656, "calibration/confidence_entropy": 0.40520614262227106, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.11927083333333333, "calibration/coverage@10%": 0.5920824601484128, "calibration/coverage@15%": 0.7792822419953277, "calibration/coverage@20%": 0.8623706827447208, "calibration/coverage@25%": 0.9580400863451055, "calibration/coverage@30%": 0.9726315789473684, "calibration/coverage@5%": 0.41194030278044985, "calibration/ece": 0.09347333135793137, "calibration/mean_confidence": 0.7569156467580505, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001041666666666674, "completions/max_length": 3735.4, "completions/max_terminated_length": 3735.4, "completions/mean_length": 956.2429809570312, "completions/mean_terminated_length": 957.259765625, "completions/min_length": 47.8, "completions/min_terminated_length": 227.6, "epoch": 1.5735928300896238, "grad_norm": 0.000430108280852437, "learning_rate": 2.3137019230769236e-06, "loss": 0.0021, "num_tokens": 1669815957.0, "reward": 1.3189053535461426, "reward_std": 0.1133571743965149, "rewards/accuracy_reward": 0.7635416626930237, "rewards/brier_reward": 0.8752951145172119, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9989583253860473, "rewards/mean_confidence_reward": 0.7757572293281555, "signal/accuracy_reward/centered_abs_mean": 0.11478949785232544, "signal/accuracy_reward/group_std_mean": 0.15749964416027068, "signal/accuracy_reward/group_zero_std_frac": 0.5305555641651154, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05739474892616272, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05739474892616272, "signal/advantage_abs_mean": 0.0811674177646637, "signal/advantage_pre_scale_abs_mean": 0.0811674177646637, "signal/advantage_pre_scale_std": 0.1642020434141159, "signal/advantage_std": 0.1642020434141159, "signal/brier_reward/centered_abs_mean": 0.07032931447029114, "signal/brier_reward/group_std_mean": 0.09857706129550933, "signal/brier_reward/group_zero_std_frac": 0.21111111640930175, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03516465723514557, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03516465723514557, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0019531250349245966, "signal/format_reward/group_std_mean": 0.004861734248697758, "signal/format_reward/group_zero_std_frac": 0.975, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0009765625174622983, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0009765625174622983, "signal/mean_confidence_reward/centered_abs_mean": 0.06286813393235206, "signal/mean_confidence_reward/group_std_mean": 0.08201692551374436, "signal/mean_confidence_reward/group_zero_std_frac": 0.2333333373069763, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.286813004408032e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.286813004408032e-07, "step": 655 }, { "calibration/aurc": 0.08112570670516984, "calibration/batch_distribution_entropy": 0.5987221368974365, "calibration/confidence_entropy": 0.3571555191027245, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.8462123691099477, "calibration/coverage@15%": 0.9014725130890053, "calibration/coverage@20%": 0.9432291666666666, "calibration/coverage@25%": 0.9817708333333334, "calibration/coverage@30%": 0.9828125, "calibration/coverage@5%": 0.37713241710296685, "calibration/ece": 0.0800317135689355, "calibration/mean_confidence": 0.7936851821553228, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0012152777777778123, "completions/max_length": 3620.6, "completions/max_terminated_length": 3620.6, "completions/mean_length": 884.5284790039062, "completions/mean_terminated_length": 885.598193359375, "completions/min_length": 0.0, "completions/min_terminated_length": 236.8, "epoch": 1.5855926800914988, "grad_norm": 0.0005706451484002173, "learning_rate": 2.283653846153846e-06, "loss": 0.0007, "num_tokens": 1683100285.0, "reward": 1.31890127658844, "reward_std": 0.1130586788058281, "rewards/accuracy_reward": 0.7696180582046509, "rewards/brier_reward": 0.8694704532623291, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9986979126930237, "rewards/mean_confidence_reward": 0.8063099026679993, "signal/accuracy_reward/centered_abs_mean": 0.11297742873430253, "signal/accuracy_reward/group_std_mean": 0.15261925756931305, "signal/accuracy_reward/group_zero_std_frac": 0.5527777791023254, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05648871436715126, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05648871436715126, "signal/advantage_abs_mean": 0.08314946293830872, "signal/advantage_pre_scale_abs_mean": 0.08314946293830872, "signal/advantage_pre_scale_std": 0.17104166150093078, "signal/advantage_std": 0.17104166150093078, "signal/brier_reward/centered_abs_mean": 0.07322535514831544, "signal/brier_reward/group_std_mean": 0.09897147268056869, "signal/brier_reward/group_zero_std_frac": 0.28333333134651184, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03661267757415772, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03661267757415772, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.00240342877805233, "signal/format_reward/group_std_mean": 0.004840169008821249, "signal/format_reward/group_zero_std_frac": 0.9805555462837219, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001201714389026165, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001201714389026165, "signal/mean_confidence_reward/centered_abs_mean": 0.056373240798711775, "signal/mean_confidence_reward/group_std_mean": 0.07522589564323426, "signal/mean_confidence_reward/group_zero_std_frac": 0.30555555522441863, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.63732407954376e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.63732407954376e-07, "step": 660 }, { "calibration/aurc": 0.1186653450567019, "calibration/batch_distribution_entropy": 0.5688573790100714, "calibration/confidence_entropy": 0.3457176775316543, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.4989583333333334, "calibration/coverage@15%": 0.7122971491228071, "calibration/coverage@20%": 0.7336842105263158, "calibration/coverage@25%": 0.8536513157894736, "calibration/coverage@30%": 0.9281907894736843, "calibration/coverage@5%": 0.4604166666666667, "calibration/ece": 0.11004111842105262, "calibration/mean_confidence": 0.8104676535087718, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011284722222222542, "completions/max_length": 3661.8, "completions/max_terminated_length": 3661.8, "completions/mean_length": 901.5636352539062, "completions/mean_terminated_length": 902.6055541992188, "completions/min_length": 53.2, "completions/min_terminated_length": 222.2, "epoch": 1.5975925300933738, "grad_norm": 0.0004567387804854661, "learning_rate": 2.2536057692307694e-06, "loss": -0.0002, "num_tokens": 1696597274.0, "reward": 1.315671968460083, "reward_std": 0.10606003403663636, "rewards/accuracy_reward": 0.7618923544883728, "rewards/brier_reward": 0.8705644965171814, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9988715291023255, "rewards/mean_confidence_reward": 0.7788426280021667, "signal/accuracy_reward/centered_abs_mean": 0.11060655564069748, "signal/accuracy_reward/group_std_mean": 0.1506906658411026, "signal/accuracy_reward/group_zero_std_frac": 0.5499999940395355, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05530327782034874, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05530327782034874, "signal/advantage_abs_mean": 0.07672593295574189, "signal/advantage_pre_scale_abs_mean": 0.07672593295574189, "signal/advantage_pre_scale_std": 0.1597493588924408, "signal/advantage_std": 0.1597493588924408, "signal/brier_reward/centered_abs_mean": 0.06591477692127228, "signal/brier_reward/group_std_mean": 0.09026184827089309, "signal/brier_reward/group_zero_std_frac": 0.2361111134290695, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03295738846063614, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03295738846063614, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0020562065998092293, "signal/format_reward/group_std_mean": 0.004051749873906374, "signal/format_reward/group_zero_std_frac": 0.9833333253860473, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0010281032999046146, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0010281032999046146, "signal/mean_confidence_reward/centered_abs_mean": 0.05552149266004562, "signal/mean_confidence_reward/group_std_mean": 0.07359625548124313, "signal/mean_confidence_reward/group_zero_std_frac": 0.2666666716337204, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.552149104914861e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.552149104914861e-07, "step": 665 }, { "calibration/aurc": 0.028898431146313262, "calibration/batch_distribution_entropy": 0.5775599915758309, "calibration/confidence_entropy": 0.3440640818260372, "calibration/coverage@0%": 0.09947916666666666, "calibration/coverage@1%": 0.5814163394415357, "calibration/coverage@10%": 0.9119437172774868, "calibration/coverage@15%": 0.965625, "calibration/coverage@20%": 1.0, "calibration/coverage@25%": 1.0, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.811297447643979, "calibration/ece": 0.10468899978184995, "calibration/mean_confidence": 0.8086489692408376, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333259, "completions/max_length": 3444.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 908.05478515625, "completions/mean_terminated_length": 909.2998901367188, "completions/min_length": 86.2, "completions/min_terminated_length": 224.0, "epoch": 1.6095923800952487, "grad_norm": 0.000404239195631817, "learning_rate": 2.2235576923076924e-06, "loss": 0.0015, "num_tokens": 1710165169.0, "reward": 1.3232361555099488, "reward_std": 0.09141702204942703, "rewards/accuracy_reward": 0.7634548664093017, "rewards/brier_reward": 0.8843045830726624, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979246139527, "rewards/mean_confidence_reward": 0.7453715443611145, "signal/accuracy_reward/centered_abs_mean": 0.10269639939069748, "signal/accuracy_reward/group_std_mean": 0.13939371109008789, "signal/accuracy_reward/group_zero_std_frac": 0.5861111164093018, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05134819969534874, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05134819969534874, "signal/advantage_abs_mean": 0.06519015058875084, "signal/advantage_pre_scale_abs_mean": 0.06519015058875084, "signal/advantage_pre_scale_std": 0.13848203122615815, "signal/advantage_std": 0.13848203122615815, "signal/brier_reward/centered_abs_mean": 0.05801916942000389, "signal/brier_reward/group_std_mean": 0.08050897121429443, "signal/brier_reward/group_zero_std_frac": 0.21111111342906952, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.029009584710001944, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.029009584710001944, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.002435980876907706, "signal/format_reward/group_std_mean": 0.005736991390585899, "signal/format_reward/group_zero_std_frac": 0.9722222208976745, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001217990438453853, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001217990438453853, "signal/mean_confidence_reward/centered_abs_mean": 0.06191907525062561, "signal/mean_confidence_reward/group_std_mean": 0.08199459463357925, "signal/mean_confidence_reward/group_zero_std_frac": 0.2166666716337204, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.19190757333854e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.19190757333854e-07, "step": 670 }, { "calibration/aurc": 0.04517212681603921, "calibration/batch_distribution_entropy": 0.6908644696230866, "calibration/confidence_entropy": 0.41022989532378507, "calibration/coverage@0%": 0.3572916666666667, "calibration/coverage@1%": 0.39895833333333336, "calibration/coverage@10%": 0.8421875, "calibration/coverage@15%": 0.9229166666666666, "calibration/coverage@20%": 0.9458333333333334, "calibration/coverage@25%": 0.9572916666666668, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.709375, "calibration/ece": 0.1308854166666667, "calibration/mean_confidence": 0.7482291666666666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001128472222222232, "completions/max_length": 3642.8, "completions/max_terminated_length": 3642.8, "completions/mean_length": 933.2782958984375, "completions/mean_terminated_length": 934.3715209960938, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 1.6215922300971237, "grad_norm": 0.0003739423700608313, "learning_rate": 2.1935096153846157e-06, "loss": 0.0006, "num_tokens": 1724011863.0, "reward": 1.3145429372787476, "reward_std": 0.09475865215063095, "rewards/accuracy_reward": 0.7628472208976745, "rewards/brier_reward": 0.8674393415451049, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9987847089767456, "rewards/mean_confidence_reward": 0.7291562557220459, "signal/accuracy_reward/centered_abs_mean": 0.09700520783662796, "signal/accuracy_reward/group_std_mean": 0.13672259598970413, "signal/accuracy_reward/group_zero_std_frac": 0.5777777791023254, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04850260391831398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04850260391831398, "signal/advantage_abs_mean": 0.0654190570116043, "signal/advantage_pre_scale_abs_mean": 0.0654190570116043, "signal/advantage_pre_scale_std": 0.13994845300912856, "signal/advantage_std": 0.13994845300912856, "signal/brier_reward/centered_abs_mean": 0.06118583530187607, "signal/brier_reward/group_std_mean": 0.08580977916717529, "signal/brier_reward/group_zero_std_frac": 0.12500000298023223, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.030592917650938033, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.030592917650938033, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.002311197901144624, "signal/format_reward/group_std_mean": 0.005925193056464195, "signal/format_reward/group_zero_std_frac": 0.9694444417953492, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001155598950572312, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001155598950572312, "signal/mean_confidence_reward/centered_abs_mean": 0.058203304558992384, "signal/mean_confidence_reward/group_std_mean": 0.07827859669923783, "signal/mean_confidence_reward/group_zero_std_frac": 0.1361111119389534, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.820330329697754e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.820330329697754e-07, "step": 675 }, { "calibration/aurc": 0.11593377351724402, "calibration/batch_distribution_entropy": 0.7156221450346608, "calibration/confidence_entropy": 0.4369838868857605, "calibration/coverage@0%": 0.10123851706036743, "calibration/coverage@1%": 0.10123851706036743, "calibration/coverage@10%": 0.46845882545931755, "calibration/coverage@15%": 0.6609046916010499, "calibration/coverage@20%": 0.872014435695538, "calibration/coverage@25%": 0.8902682086614174, "calibration/coverage@30%": 0.9441437007874015, "calibration/coverage@5%": 0.3424171587926509, "calibration/ece": 0.10972945374015745, "calibration/mean_confidence": 0.7415065206692913, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001128472222222232, "completions/max_length": 3547.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 905.365625, "completions/mean_terminated_length": 906.4061401367187, "completions/min_length": 0.0, "completions/min_terminated_length": 255.4, "epoch": 1.6335920800989987, "grad_norm": 0.0004761804884765297, "learning_rate": 2.1634615384615387e-06, "loss": 0.0022, "num_tokens": 1737579787.0, "reward": 1.3048258304595948, "reward_std": 0.10577225983142853, "rewards/accuracy_reward": 0.7521701455116272, "rewards/brier_reward": 0.8585953116416931, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9988715171813964, "rewards/mean_confidence_reward": 0.7304053783416748, "signal/accuracy_reward/centered_abs_mean": 0.12133789211511611, "signal/accuracy_reward/group_std_mean": 0.16077230870723724, "signal/accuracy_reward/group_zero_std_frac": 0.5361111283302307, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06066894605755806, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06066894605755806, "signal/advantage_abs_mean": 0.0790418416261673, "signal/advantage_pre_scale_abs_mean": 0.0790418416261673, "signal/advantage_pre_scale_std": 0.15420669615268706, "signal/advantage_std": 0.15420669615268706, "signal/brier_reward/centered_abs_mean": 0.06567397564649582, "signal/brier_reward/group_std_mean": 0.08779519647359849, "signal/brier_reward/group_zero_std_frac": 0.08888889029622078, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03283698782324791, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03283698782324791, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.002143012115266174, "signal/format_reward/group_std_mean": 0.005434146523475647, "signal/format_reward/group_zero_std_frac": 0.9722222089767456, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001071506057633087, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001071506057633087, "signal/mean_confidence_reward/centered_abs_mean": 0.06179487705230713, "signal/mean_confidence_reward/group_std_mean": 0.08049440681934357, "signal/mean_confidence_reward/group_zero_std_frac": 0.09166666939854622, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.179487513691129e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.179487513691129e-07, "step": 680 }, { "calibration/aurc": 0.13635622636716482, "calibration/batch_distribution_entropy": 0.7087501824105223, "calibration/confidence_entropy": 0.4262381415794156, "calibration/coverage@0%": 0.14791666666666667, "calibration/coverage@1%": 0.2265625, "calibration/coverage@10%": 0.4767406440382941, "calibration/coverage@15%": 0.5449698107049608, "calibration/coverage@20%": 0.694620322019147, "calibration/coverage@25%": 0.7800709856396867, "calibration/coverage@30%": 0.8452445060922542, "calibration/coverage@5%": 0.3921875, "calibration/ece": 0.13608742656657966, "calibration/mean_confidence": 0.7271390203437772, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001041666666666674, "completions/max_length": 3708.4, "completions/max_terminated_length": 3708.4, "completions/mean_length": 939.0347412109375, "completions/mean_terminated_length": 940.027197265625, "completions/min_length": 35.6, "completions/min_terminated_length": 205.2, "epoch": 1.6455919301008737, "grad_norm": 0.0004425387887749821, "learning_rate": 2.1334134615384616e-06, "loss": 0.0007, "num_tokens": 1751512219.0, "reward": 1.3130683898925781, "reward_std": 0.08265728801488877, "rewards/accuracy_reward": 0.7550347208976745, "rewards/brier_reward": 0.8721288919448853, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9989583253860473, "rewards/mean_confidence_reward": 0.738863718509674, "signal/accuracy_reward/centered_abs_mean": 0.08947482705116272, "signal/accuracy_reward/group_std_mean": 0.12329307794570923, "signal/accuracy_reward/group_zero_std_frac": 0.6222222208976745, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04473741352558136, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04473741352558136, "signal/advantage_abs_mean": 0.05821612551808357, "signal/advantage_pre_scale_abs_mean": 0.05821612551808357, "signal/advantage_pre_scale_std": 0.12943169772624968, "signal/advantage_std": 0.12943169772624968, "signal/brier_reward/centered_abs_mean": 0.05658187344670296, "signal/brier_reward/group_std_mean": 0.07787532731890678, "signal/brier_reward/group_zero_std_frac": 0.16944444477558135, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.02829093672335148, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.02829093672335148, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0019639757112599908, "signal/format_reward/group_std_mean": 0.004644159367308021, "signal/format_reward/group_zero_std_frac": 0.9777777671813965, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0009819878556299954, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0009819878556299954, "signal/mean_confidence_reward/centered_abs_mean": 0.05504027381539345, "signal/mean_confidence_reward/group_std_mean": 0.07294548600912094, "signal/mean_confidence_reward/group_zero_std_frac": 0.17500000298023224, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.504027399183542e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.504027399183542e-07, "step": 685 }, { "calibration/aurc": 0.10829552369532418, "calibration/batch_distribution_entropy": 0.726435924371739, "calibration/confidence_entropy": 0.40613672513819044, "calibration/coverage@0%": 0.17360154199475067, "calibration/coverage@1%": 0.20120570866141732, "calibration/coverage@10%": 0.45213765350561597, "calibration/coverage@15%": 0.7995615624336122, "calibration/coverage@20%": 0.871646540469974, "calibration/coverage@25%": 0.9316294060052218, "calibration/coverage@30%": 0.9733681462140993, "calibration/coverage@5%": 0.38824487058243046, "calibration/ece": 0.1130952371927317, "calibration/mean_confidence": 0.7281674831289447, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0019097222222222321, "completions/max_length": 3939.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 940.6310913085938, "completions/mean_terminated_length": 942.4517333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 1.6575917801027487, "grad_norm": 0.0004989943699911237, "learning_rate": 2.103365384615385e-06, "loss": -0.0008, "num_tokens": 1765446881.0, "reward": 1.3000996112823486, "reward_std": 0.10869093090295792, "rewards/accuracy_reward": 0.7424479126930237, "rewards/brier_reward": 0.859645938873291, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9980902910232544, "rewards/mean_confidence_reward": 0.7536944389343262, "signal/accuracy_reward/centered_abs_mean": 0.11841905564069748, "signal/accuracy_reward/group_std_mean": 0.1600315272808075, "signal/accuracy_reward/group_zero_std_frac": 0.5277777791023255, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05920952782034874, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05920952782034874, "signal/advantage_abs_mean": 0.07815827280282975, "signal/advantage_pre_scale_abs_mean": 0.07815827280282975, "signal/advantage_pre_scale_std": 0.15756354928016664, "signal/advantage_std": 0.15756354928016664, "signal/brier_reward/centered_abs_mean": 0.06629163175821304, "signal/brier_reward/group_std_mean": 0.0902453675866127, "signal/brier_reward/group_zero_std_frac": 0.1833333343267441, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03314581587910652, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03314581587910652, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0035373263410292564, "signal/format_reward/group_std_mean": 0.00784455295652151, "signal/format_reward/group_zero_std_frac": 0.9638888835906982, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0017686631705146282, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0017686631705146282, "signal/mean_confidence_reward/centered_abs_mean": 0.05965604856610298, "signal/mean_confidence_reward/group_std_mean": 0.07961117178201675, "signal/mean_confidence_reward/group_zero_std_frac": 0.18888889253139496, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.965604827906645e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.965604827906645e-07, "step": 690 }, { "calibration/aurc": 0.12349518727525981, "calibration/batch_distribution_entropy": 0.6866424399814275, "calibration/confidence_entropy": 0.4288375245089998, "calibration/coverage@0%": 0.0715487151298612, "calibration/coverage@1%": 0.114890751683386, "calibration/coverage@10%": 0.47619426822861766, "calibration/coverage@15%": 0.64311634684311, "calibration/coverage@20%": 0.73093070615516, "calibration/coverage@25%": 0.8014168554382894, "calibration/coverage@30%": 0.9336195692109855, "calibration/coverage@5%": 0.41226124376786843, "calibration/ece": 0.09158310536844073, "calibration/mean_confidence": 0.6857670514825813, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0019097222222222543, "completions/max_length": 3701.8, "completions/max_terminated_length": 3701.8, "completions/mean_length": 930.9134521484375, "completions/mean_terminated_length": 932.7838989257813, "completions/min_length": 0.0, "completions/min_terminated_length": 252.6, "epoch": 1.6695916301046236, "grad_norm": 0.0006645110552199185, "learning_rate": 2.073317307692308e-06, "loss": -0.0008, "num_tokens": 1779245340.0, "reward": 1.3017965793609618, "reward_std": 0.09792021811008453, "rewards/accuracy_reward": 0.7466145873069763, "rewards/brier_reward": 0.8588735938072205, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9980902791023254, "rewards/mean_confidence_reward": 0.7308376789093017, "signal/accuracy_reward/centered_abs_mean": 0.11240776926279068, "signal/accuracy_reward/group_std_mean": 0.14818194806575774, "signal/accuracy_reward/group_zero_std_frac": 0.5861111044883728, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05620388463139534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05620388463139534, "signal/advantage_abs_mean": 0.07154968529939651, "signal/advantage_pre_scale_abs_mean": 0.07154968529939651, "signal/advantage_pre_scale_std": 0.1484357163310051, "signal/advantage_std": 0.1484357163310051, "signal/brier_reward/centered_abs_mean": 0.06797878667712212, "signal/brier_reward/group_std_mean": 0.09093901962041855, "signal/brier_reward/group_zero_std_frac": 0.1333333358168602, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03398939333856106, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03398939333856106, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0034288193681277336, "signal/format_reward/group_std_mean": 0.0073366358410567045, "signal/format_reward/group_zero_std_frac": 0.9666666626930237, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0017144096840638668, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0017144096840638668, "signal/mean_confidence_reward/centered_abs_mean": 0.057592136412858964, "signal/mean_confidence_reward/group_std_mean": 0.07626924365758896, "signal/mean_confidence_reward/group_zero_std_frac": 0.14722222089767456, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.759213649980666e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.759213649980666e-07, "step": 695 }, { "calibration/aurc": 0.12793938701323504, "calibration/batch_distribution_entropy": 0.6247281317593819, "calibration/confidence_entropy": 0.4164691967240392, "calibration/coverage@0%": 0.2159609041438264, "calibration/coverage@1%": 0.3154400708104931, "calibration/coverage@10%": 0.5208224685476559, "calibration/coverage@15%": 0.5945517015706806, "calibration/coverage@20%": 0.745574280104712, "calibration/coverage@25%": 0.7591623036649214, "calibration/coverage@30%": 0.8754035776614311, "calibration/coverage@5%": 0.45087985540579334, "calibration/ece": 0.1619584086606154, "calibration/mean_confidence": 0.7240917812381812, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009548611111111383, "completions/max_length": 3507.4, "completions/max_terminated_length": 3507.4, "completions/mean_length": 916.8333251953125, "completions/mean_terminated_length": 917.7268432617187, "completions/min_length": 0.0, "completions/min_terminated_length": 212.8, "epoch": 1.6815914801064986, "grad_norm": 0.0004557875799946487, "learning_rate": 2.043269230769231e-06, "loss": 0.0017, "num_tokens": 1792927292.0, "reward": 1.2778399467468262, "reward_std": 0.09882068485021592, "rewards/accuracy_reward": 0.7193576335906983, "rewards/brier_reward": 0.8372628808021545, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9990451335906982, "rewards/mean_confidence_reward": 0.7092057347297669, "signal/accuracy_reward/centered_abs_mean": 0.10740559846162796, "signal/accuracy_reward/group_std_mean": 0.1522793859243393, "signal/accuracy_reward/group_zero_std_frac": 0.525, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05370279923081398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05370279923081398, "signal/advantage_abs_mean": 0.0689640335738659, "signal/advantage_pre_scale_abs_mean": 0.0689640335738659, "signal/advantage_pre_scale_std": 0.13824399411678315, "signal/advantage_std": 0.13824399411678315, "signal/brier_reward/centered_abs_mean": 0.06837288737297058, "signal/brier_reward/group_std_mean": 0.09447903037071229, "signal/brier_reward/group_zero_std_frac": 0.04722222331911326, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03418644368648529, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03418644368648529, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0018283419660292566, "signal/format_reward/group_std_mean": 0.004803628241643309, "signal/format_reward/group_zero_std_frac": 0.9749999880790711, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0009141709830146283, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0009141709830146283, "signal/mean_confidence_reward/centered_abs_mean": 0.06591395735740661, "signal/mean_confidence_reward/group_std_mean": 0.08698421269655228, "signal/mean_confidence_reward/group_zero_std_frac": 0.050000001676380634, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.591395504074171e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.591395504074171e-07, "step": 700 }, { "epoch": 1.6815914801064986, "eval_calibration/aurc": 0.1032289342686364, "eval_calibration/batch_distribution_entropy": 0.6335866632583236, "eval_calibration/confidence_entropy": 0.45924526008679173, "eval_calibration/coverage@0%": 0.24495967741935484, "eval_calibration/coverage@1%": 0.24495967741935484, "eval_calibration/coverage@10%": 0.6078629032258065, "eval_calibration/coverage@15%": 0.6970766129032259, "eval_calibration/coverage@20%": 0.8645833333333334, "eval_calibration/coverage@25%": 0.9635416666666666, "eval_calibration/coverage@30%": 1.0, "eval_calibration/coverage@5%": 0.37836021505376344, "eval_calibration/ece": 0.13638272849462366, "eval_calibration/mean_confidence": 0.6860467069892473, "eval_completions/clipped_ratio": 0.001736111111111105, "eval_completions/max_length": 3032.6666666666665, "eval_completions/max_terminated_length": 3032.6666666666665, "eval_completions/mean_length": 945.2144165039062, "eval_completions/mean_terminated_length": 946.8868713378906, "eval_completions/min_length": 230.0, "eval_completions/min_terminated_length": 257.6666666666667, "eval_loss": 0.0, "eval_num_tokens": 1792927292.0, "eval_reward": 1.2888956268628438, "eval_reward_std": 0.27547654012839, "eval_rewards/accuracy_reward": 0.7196180621782938, "eval_rewards/brier_reward": 0.8598958452542623, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9982638855775198, "eval_rewards/mean_confidence_reward": 0.6698784629503886, "eval_runtime": 173.2098, "eval_samples_per_second": 5.773, "eval_signal/accuracy_reward/centered_abs_mean": 0.3915473173062007, "eval_signal/accuracy_reward/group_std_mean": 0.44827142357826233, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19577365865310034, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.19577365865310034, "eval_signal/advantage_abs_mean": 0.23171568910280863, "eval_signal/advantage_pre_scale_abs_mean": 0.23171568910280863, "eval_signal/advantage_pre_scale_std": 0.27345553537209827, "eval_signal/advantage_std": 0.27345553537209827, "eval_signal/brier_reward/centered_abs_mean": 0.14941502610842386, "eval_signal/brier_reward/group_std_mean": 0.20023001482089361, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07470751305421193, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.07470751305421193, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.0033637151742974916, "eval_signal/format_reward/group_std_mean": 0.009820927555362383, "eval_signal/format_reward/group_zero_std_frac": 0.9444444477558136, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.23324380815029144, "eval_signal/mean_confidence_reward/group_std_mean": 0.26400945087273914, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.33243796780395e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.33243796780395e-06, "eval_steps_per_second": 0.035, "step": 700 }, { "epoch": 1.6815914801064986, "step": 700, "train_probe_calibration/aurc": 0.13447049760589894, "train_probe_calibration/batch_distribution_entropy": 0.6757262044443509, "train_probe_calibration/confidence_entropy": 0.47440274213469774, "train_probe_calibration/coverage@0%": 0.19791666666666666, "train_probe_calibration/coverage@1%": 0.19791666666666666, "train_probe_calibration/coverage@10%": 0.359375, "train_probe_calibration/coverage@15%": 0.6614583333333334, "train_probe_calibration/coverage@20%": 0.8125, "train_probe_calibration/coverage@25%": 0.921875, "train_probe_calibration/coverage@30%": 0.9479166666666666, "train_probe_calibration/coverage@5%": 0.24479166666666666, "train_probe_calibration/ece": 0.13359375, "train_probe_calibration/mean_confidence": 0.6705729166666666, "train_probe_completions/clipped_ratio": 0.0008680555555555617, "train_probe_completions/max_length": 3178.3333333333335, "train_probe_completions/max_terminated_length": 3178.3333333333335, "train_probe_completions/mean_length": 944.2076619466146, "train_probe_completions/mean_terminated_length": 945.0102742513021, "train_probe_completions/min_length": 215.66666666666666, "train_probe_completions/min_terminated_length": 249.83333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 1792927292.0, "train_probe_reward": 1.311933974424998, "train_probe_reward_std": 0.26249803105990094, "train_probe_rewards/accuracy_reward": 0.7560763955116272, "train_probe_rewards/brier_reward": 0.8686458468437195, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9991319477558136, "train_probe_rewards/mean_confidence_reward": 0.6874131759007772, "train_probe_runtime": 176.8811, "train_probe_samples_per_second": 5.654, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3607313384612401, "train_probe_signal/accuracy_reward/group_std_mean": 0.4296838740507762, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.18036566923062006, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.18036566923062006, "train_probe_signal/advantage_abs_mean": 0.21678911397854486, "train_probe_signal/advantage_pre_scale_abs_mean": 0.21678911397854486, "train_probe_signal/advantage_pre_scale_std": 0.2614838605125745, "train_probe_signal/advantage_std": 0.2614838605125745, "train_probe_signal/brier_reward/centered_abs_mean": 0.14130250861247381, "train_probe_signal/brier_reward/group_std_mean": 0.1884548415740331, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07065125430623691, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07065125430623691, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.0016818575871487458, "train_probe_signal/format_reward/group_std_mean": 0.0049104637776811915, "train_probe_signal/format_reward/group_zero_std_frac": 0.9722222288449606, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0008409287935743729, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.0008409287935743729, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.22668184340000153, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.2597619369626045, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 2.2668183419227717e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 2.2668183419227717e-06, "train_probe_steps_per_second": 0.034 }, { "calibration/aurc": 0.07701944484693024, "calibration/batch_distribution_entropy": 0.6093866688467507, "calibration/confidence_entropy": 0.4581012097322594, "calibration/coverage@0%": 0.26458469321148825, "calibration/coverage@1%": 0.29062635987815494, "calibration/coverage@10%": 0.759259410356832, "calibration/coverage@15%": 0.821316090078329, "calibration/coverage@20%": 0.8682291666666668, "calibration/coverage@25%": 0.9026041666666668, "calibration/coverage@30%": 0.9463541666666668, "calibration/coverage@5%": 0.47239719321148826, "calibration/ece": 0.13468444027415133, "calibration/mean_confidence": 0.7053850494995648, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0021701388888888838, "completions/max_length": 3854.0, "completions/max_terminated_length": 3854.0, "completions/mean_length": 978.4721435546875, "completions/mean_terminated_length": 980.5973754882813, "completions/min_length": 0.0, "completions/min_terminated_length": 231.8, "epoch": 1.6935913301083736, "grad_norm": 0.00047394505236297846, "learning_rate": 2.013221153846154e-06, "loss": -0.0009, "num_tokens": 1807285307.0, "reward": 1.2959699869155883, "reward_std": 0.09457662254571915, "rewards/accuracy_reward": 0.7342013955116272, "rewards/brier_reward": 0.8601558089256287, "rewards/confidence_one_or_zero": 0.00026041667442768814, "rewards/format_reward": 0.9975694417953491, "rewards/mean_confidence_reward": 0.6654687523841858, "signal/accuracy_reward/centered_abs_mean": 0.11292317658662795, "signal/accuracy_reward/group_std_mean": 0.15223421454429625, "signal/accuracy_reward/group_zero_std_frac": 0.5666666626930237, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05646158829331398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05646158829331398, "signal/advantage_abs_mean": 0.06792442798614502, "signal/advantage_pre_scale_abs_mean": 0.06792442798614502, "signal/advantage_pre_scale_std": 0.1366561993956566, "signal/advantage_std": 0.1366561993956566, "signal/brier_reward/centered_abs_mean": 0.06400572583079338, "signal/brier_reward/group_std_mean": 0.08666503131389618, "signal/brier_reward/group_zero_std_frac": 0.10000000335276127, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03200286291539669, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03200286291539669, "signal/confidence_one_or_zero/centered_abs_mean": 0.00047200522385537624, "signal/confidence_one_or_zero/group_std_mean": 0.0008226238191127777, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.7200515496115255e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.7200515496115255e-09, "signal/format_reward/centered_abs_mean": 0.004503038211259991, "signal/format_reward/group_std_mean": 0.01008768230676651, "signal/format_reward/group_zero_std_frac": 0.9527777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0022515191056299956, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0022515191056299956, "signal/mean_confidence_reward/centered_abs_mean": 0.06373020112514496, "signal/mean_confidence_reward/group_std_mean": 0.0837476372718811, "signal/mean_confidence_reward/group_zero_std_frac": 0.10000000335276127, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.373019800776092e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.373019800776092e-07, "step": 705 }, { "calibration/aurc": 0.11453615281428305, "calibration/batch_distribution_entropy": 0.7087894019863262, "calibration/confidence_entropy": 0.49413314237369105, "calibration/coverage@0%": 0.013572972343353428, "calibration/coverage@1%": 0.3672903452353287, "calibration/coverage@10%": 0.6083895796025682, "calibration/coverage@15%": 0.6750562462692348, "calibration/coverage@20%": 0.7601647759376468, "calibration/coverage@25%": 0.8838269691035684, "calibration/coverage@30%": 0.9125, "calibration/coverage@5%": 0.5113905743783578, "calibration/ece": 0.1072999526676964, "calibration/mean_confidence": 0.637247136708907, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001475694444444442, "completions/max_length": 3948.6, "completions/max_terminated_length": 3948.6, "completions/mean_length": 984.5948974609375, "completions/mean_terminated_length": 986.046875, "completions/min_length": 48.0, "completions/min_terminated_length": 232.0, "epoch": 1.7055911801102486, "grad_norm": 0.0003736707440111786, "learning_rate": 1.983173076923077e-06, "loss": 0.0001, "num_tokens": 1821761184.0, "reward": 1.3089587211608886, "reward_std": 0.09990472048521042, "rewards/accuracy_reward": 0.7460069417953491, "rewards/brier_reward": 0.8733721613883972, "rewards/confidence_one_or_zero": 0.00026041667442768814, "rewards/format_reward": 0.998524296283722, "rewards/mean_confidence_reward": 0.7020269155502319, "signal/accuracy_reward/centered_abs_mean": 0.1029296875, "signal/accuracy_reward/group_std_mean": 0.1483136534690857, "signal/accuracy_reward/group_zero_std_frac": 0.5305555641651154, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05146484375, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05146484375, "signal/advantage_abs_mean": 0.06866423338651657, "signal/advantage_pre_scale_abs_mean": 0.06866423338651657, "signal/advantage_pre_scale_std": 0.1432929664850235, "signal/advantage_std": 0.1432929664850235, "signal/brier_reward/centered_abs_mean": 0.059538546949625015, "signal/brier_reward/group_std_mean": 0.08378432393074035, "signal/brier_reward/group_zero_std_frac": 0.13333333432674407, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.029769273474812508, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.029769273474812508, "signal/confidence_one_or_zero/centered_abs_mean": 0.00047200522385537624, "signal/confidence_one_or_zero/group_std_mean": 0.0008226238191127777, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.720051904882894e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.720051904882894e-09, "signal/format_reward/centered_abs_mean": 0.002739800396375358, "signal/format_reward/group_std_mean": 0.006068569049239159, "signal/format_reward/group_zero_std_frac": 0.9722222089767456, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001369900198187679, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001369900198187679, "signal/mean_confidence_reward/centered_abs_mean": 0.06110650151968002, "signal/mean_confidence_reward/group_std_mean": 0.0808556079864502, "signal/mean_confidence_reward/group_zero_std_frac": 0.14444444477558135, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.110649678703339e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.110649678703339e-07, "step": 710 }, { "calibration/aurc": 0.16485441792318775, "calibration/batch_distribution_entropy": 0.5250986502336006, "calibration/confidence_entropy": 0.4342079996831343, "calibration/coverage@0%": 0.017771051483420595, "calibration/coverage@1%": 0.1974585514834206, "calibration/coverage@10%": 0.34381271815008724, "calibration/coverage@15%": 0.5955033813263525, "calibration/coverage@20%": 0.6476930628272252, "calibration/coverage@25%": 0.7914430628272251, "calibration/coverage@30%": 0.8233229712041885, "calibration/coverage@5%": 0.34381271815008724, "calibration/ece": 0.153591977530541, "calibration/mean_confidence": 0.7599286921902271, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001822916666666652, "completions/max_length": 3909.0, "completions/max_terminated_length": 3909.0, "completions/mean_length": 999.7485107421875, "completions/mean_terminated_length": 1001.5943725585937, "completions/min_length": 0.0, "completions/min_terminated_length": 242.2, "epoch": 1.7175910301121236, "grad_norm": 0.0004361154278740287, "learning_rate": 1.953125e-06, "loss": 0.0002, "num_tokens": 1836425871.0, "reward": 1.2946624279022216, "reward_std": 0.09649095088243484, "rewards/accuracy_reward": 0.7265625, "rewards/brier_reward": 0.8645705342292785, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9981770753860474, "rewards/mean_confidence_reward": 0.7353255271911621, "signal/accuracy_reward/centered_abs_mean": 0.09596354216337204, "signal/accuracy_reward/group_std_mean": 0.13301910758018493, "signal/accuracy_reward/group_zero_std_frac": 0.5972222328186035, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04798177108168602, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04798177108168602, "signal/advantage_abs_mean": 0.06781797409057617, "signal/advantage_pre_scale_abs_mean": 0.06781797409057617, "signal/advantage_pre_scale_std": 0.14711880087852477, "signal/advantage_std": 0.14711880087852477, "signal/brier_reward/centered_abs_mean": 0.05979919284582138, "signal/brier_reward/group_std_mean": 0.08265522271394729, "signal/brier_reward/group_zero_std_frac": 0.1694444462656975, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.02989959642291069, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.02989959642291069, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.003412543493323028, "signal/format_reward/group_std_mean": 0.0077864469960331915, "signal/format_reward/group_zero_std_frac": 0.9638888835906982, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001706271746661514, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001706271746661514, "signal/mean_confidence_reward/centered_abs_mean": 0.055402564257383345, "signal/mean_confidence_reward/group_std_mean": 0.0736262783408165, "signal/mean_confidence_reward/group_zero_std_frac": 0.17777777910232545, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.540256154290546e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.540256154290546e-07, "step": 715 }, { "calibration/aurc": 0.1545213158643791, "calibration/batch_distribution_entropy": 0.5090792973043831, "calibration/confidence_entropy": 0.4209588919077111, "calibration/coverage@0%": 0.07201525968404121, "calibration/coverage@1%": 0.07201525968404121, "calibration/coverage@10%": 0.47174529895105693, "calibration/coverage@15%": 0.5229833006962576, "calibration/coverage@20%": 0.6701615535248042, "calibration/coverage@25%": 0.8235734878154917, "calibration/coverage@30%": 0.880462630548303, "calibration/coverage@5%": 0.21232939580969562, "calibration/ece": 0.13001617970213117, "calibration/mean_confidence": 0.7799443824074201, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0018229166666666962, "completions/max_length": 3669.4, "completions/max_terminated_length": 3669.4, "completions/mean_length": 1039.4793579101563, "completions/mean_terminated_length": 1041.4698120117187, "completions/min_length": 0.0, "completions/min_terminated_length": 276.6, "epoch": 1.7295908801139985, "grad_norm": 0.0006103924824856222, "learning_rate": 1.9230769230769234e-06, "loss": -0.0002, "num_tokens": 1851507041.0, "reward": 1.2863576412200928, "reward_std": 0.130401174724102, "rewards/accuracy_reward": 0.7230902791023255, "rewards/brier_reward": 0.8514327049255371, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9981770873069763, "rewards/mean_confidence_reward": 0.7667690753936768, "signal/accuracy_reward/centered_abs_mean": 0.1353190079331398, "signal/accuracy_reward/group_std_mean": 0.17822936773300171, "signal/accuracy_reward/group_zero_std_frac": 0.4888889014720917, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0676595039665699, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0676595039665699, "signal/advantage_abs_mean": 0.09637026190757751, "signal/advantage_pre_scale_abs_mean": 0.09637026190757751, "signal/advantage_pre_scale_std": 0.18330829739570617, "signal/advantage_std": 0.18330829739570617, "signal/brier_reward/centered_abs_mean": 0.0779776081442833, "signal/brier_reward/group_std_mean": 0.10611708462238312, "signal/brier_reward/group_zero_std_frac": 0.1250000014901161, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03898880407214165, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03898880407214165, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.003412543318700045, "signal/format_reward/group_std_mean": 0.008032754249870776, "signal/format_reward/group_zero_std_frac": 0.9611111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0017062716593500226, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0017062716593500226, "signal/mean_confidence_reward/centered_abs_mean": 0.060709098726511, "signal/mean_confidence_reward/group_std_mean": 0.07999620288610458, "signal/mean_confidence_reward/group_zero_std_frac": 0.14722222238779067, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.070909648769884e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.070909648769884e-07, "step": 720 }, { "calibration/aurc": 0.06706318162214964, "calibration/batch_distribution_entropy": 0.45206136414036235, "calibration/confidence_entropy": 0.4082263181729656, "calibration/coverage@0%": 0.10377774151436032, "calibration/coverage@1%": 0.20821638381201044, "calibration/coverage@10%": 0.75287886205396, "calibration/coverage@15%": 0.8535316035683204, "calibration/coverage@20%": 0.9155542863359443, "calibration/coverage@25%": 1.0, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.5702281875543951, "calibration/ece": 0.09783160628807645, "calibration/mean_confidence": 0.7937404128590079, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002430555555555558, "completions/max_length": 3883.2, "completions/max_terminated_length": 3883.2, "completions/mean_length": 1009.53974609375, "completions/mean_terminated_length": 1012.075732421875, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 1.7415907301158735, "grad_norm": 0.0005445472779683769, "learning_rate": 1.8930288461538463e-06, "loss": -0.0011, "num_tokens": 1866237835.0, "reward": 1.305838656425476, "reward_std": 0.11688112020492554, "rewards/accuracy_reward": 0.7559027791023254, "rewards/brier_reward": 0.858189058303833, "rewards/confidence_one_or_zero": 0.0003472222248092294, "rewards/format_reward": 0.9975694417953491, "rewards/mean_confidence_reward": 0.8031406044960022, "signal/accuracy_reward/centered_abs_mean": 0.1146050363779068, "signal/accuracy_reward/group_std_mean": 0.153210586309433, "signal/accuracy_reward/group_zero_std_frac": 0.5527777910232544, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0573025181889534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0573025181889534, "signal/advantage_abs_mean": 0.0854413241147995, "signal/advantage_pre_scale_abs_mean": 0.0854413241147995, "signal/advantage_pre_scale_std": 0.17521314322948456, "signal/advantage_std": 0.17521314322948456, "signal/brier_reward/centered_abs_mean": 0.07106678932905197, "signal/brier_reward/group_std_mean": 0.0976751372218132, "signal/brier_reward/group_zero_std_frac": 0.1361111134290695, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.035533394664525986, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.035533394664525986, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006727430503815412, "signal/confidence_one_or_zero/group_std_mean": 0.0019641853868961334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9888888835906983, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/format_reward/centered_abs_mean": 0.004492187488358468, "signal/format_reward/group_std_mean": 0.009822000749409199, "signal/format_reward/group_zero_std_frac": 0.9555555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002246093744179234, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002246093744179234, "signal/mean_confidence_reward/centered_abs_mean": 0.05268317088484764, "signal/mean_confidence_reward/group_std_mean": 0.07052040547132492, "signal/mean_confidence_reward/group_zero_std_frac": 0.14444444477558135, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.268316954243346e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.268316954243346e-07, "step": 725 }, { "calibration/aurc": 0.12155745144364072, "calibration/batch_distribution_entropy": 0.48631310133138406, "calibration/confidence_entropy": 0.4217766848929708, "calibration/coverage@0%": 0.05642978870768571, "calibration/coverage@1%": 0.05642978870768571, "calibration/coverage@10%": 0.556475262748395, "calibration/coverage@15%": 0.6025253222128052, "calibration/coverage@20%": 0.678793241277414, "calibration/coverage@25%": 0.8841746280056867, "calibration/coverage@30%": 0.9703097731239094, "calibration/coverage@5%": 0.3978952717250147, "calibration/ece": 0.12807002674759288, "calibration/mean_confidence": 0.7784683606903796, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0014756944444444641, "completions/max_length": 3773.6, "completions/max_terminated_length": 3773.6, "completions/mean_length": 1056.66416015625, "completions/mean_terminated_length": 1058.3031860351562, "completions/min_length": 0.0, "completions/min_terminated_length": 260.4, "epoch": 1.7535905801177485, "grad_norm": 0.0005527407047338784, "learning_rate": 1.8629807692307695e-06, "loss": -0.0005, "num_tokens": 1881516174.0, "reward": 1.2720434665679932, "reward_std": 0.11679658740758896, "rewards/accuracy_reward": 0.7072048664093018, "rewards/brier_reward": 0.8383417367935181, "rewards/confidence_one_or_zero": 0.0019965277169831097, "rewards/format_reward": 0.998524296283722, "rewards/mean_confidence_reward": 0.7970329761505127, "signal/accuracy_reward/centered_abs_mean": 0.1127875417470932, "signal/accuracy_reward/group_std_mean": 0.14582538306713105, "signal/accuracy_reward/group_zero_std_frac": 0.5972222328186035, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0563937708735466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0563937708735466, "signal/advantage_abs_mean": 0.08945094943046569, "signal/advantage_pre_scale_abs_mean": 0.08945094943046569, "signal/advantage_pre_scale_std": 0.1786717414855957, "signal/advantage_std": 0.1786717414855957, "signal/brier_reward/centered_abs_mean": 0.07774122953414916, "signal/brier_reward/group_std_mean": 0.10102987438440322, "signal/brier_reward/group_zero_std_frac": 0.14166666865348815, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03887061476707458, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03887061476707458, "signal/confidence_one_or_zero/centered_abs_mean": 0.0015896267024800182, "signal/confidence_one_or_zero/group_std_mean": 0.0023225335404276847, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.5896266347681377e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.5896266347681377e-08, "signal/format_reward/centered_abs_mean": 0.0027940537431277333, "signal/format_reward/group_std_mean": 0.006800450198352337, "signal/format_reward/group_zero_std_frac": 0.9666666626930237, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0013970268715638666, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0013970268715638666, "signal/mean_confidence_reward/centered_abs_mean": 0.052974996715784074, "signal/mean_confidence_reward/group_std_mean": 0.06953665986657143, "signal/mean_confidence_reward/group_zero_std_frac": 0.15555555522441863, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.297499512835202e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.297499512835202e-07, "step": 730 }, { "calibration/aurc": 0.1882357182829569, "calibration/batch_distribution_entropy": 0.5019809932499741, "calibration/confidence_entropy": 0.4160138572118915, "calibration/coverage@0%": 0.03134530595090991, "calibration/coverage@1%": 0.03134530595090991, "calibration/coverage@10%": 0.3234522093734435, "calibration/coverage@15%": 0.4500186167805958, "calibration/coverage@20%": 0.63010167326679, "calibration/coverage@25%": 0.6926291290957143, "calibration/coverage@30%": 0.7764397905759163, "calibration/coverage@5%": 0.14887158648622628, "calibration/ece": 0.1403794814760268, "calibration/mean_confidence": 0.7810818977581938, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003385416666666674, "completions/max_length": 3924.4, "completions/max_terminated_length": 3924.4, "completions/mean_length": 1033.9052001953125, "completions/mean_terminated_length": 1037.4953491210938, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 1.7655904301196235, "grad_norm": 0.0006181937060318887, "learning_rate": 1.8329326923076924e-06, "loss": -0.0028, "num_tokens": 1896527882.0, "reward": 1.2813000917434691, "reward_std": 0.11848197728395463, "rewards/accuracy_reward": 0.7201388835906982, "rewards/brier_reward": 0.8458308815956116, "rewards/confidence_one_or_zero": 0.00034722223062999547, "rewards/format_reward": 0.9966145873069763, "rewards/mean_confidence_reward": 0.790285587310791, "signal/accuracy_reward/centered_abs_mean": 0.1119900181889534, "signal/accuracy_reward/group_std_mean": 0.15012822449207305, "signal/accuracy_reward/group_zero_std_frac": 0.5638888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0559950090944767, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0559950090944767, "signal/advantage_abs_mean": 0.08606994301080703, "signal/advantage_pre_scale_abs_mean": 0.08606994301080703, "signal/advantage_pre_scale_std": 0.1788394957780838, "signal/advantage_std": 0.1788394957780838, "signal/brier_reward/centered_abs_mean": 0.07389550507068635, "signal/brier_reward/group_std_mean": 0.10071228742599488, "signal/brier_reward/group_zero_std_frac": 0.12777778059244155, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03694775253534317, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03694775253534317, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006618923391215504, "signal/confidence_one_or_zero/group_std_mean": 0.0016652446240186692, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.618923364953844e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.618923364953844e-09, "signal/format_reward/centered_abs_mean": 0.005300564295612276, "signal/format_reward/group_std_mean": 0.01066794004291296, "signal/format_reward/group_zero_std_frac": 0.9527777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002650282147806138, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002650282147806138, "signal/mean_confidence_reward/centered_abs_mean": 0.049408917874097825, "signal/mean_confidence_reward/group_std_mean": 0.06539798825979233, "signal/mean_confidence_reward/group_zero_std_frac": 0.15555555522441863, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.940891585647478e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.940891585647478e-07, "step": 735 }, { "calibration/aurc": 0.12665159301844126, "calibration/batch_distribution_entropy": 0.3435448822782101, "calibration/confidence_entropy": 0.37428605680815175, "calibration/coverage@0%": 0.009920311140121844, "calibration/coverage@1%": 0.009920311140121844, "calibration/coverage@10%": 0.38883947998259355, "calibration/coverage@15%": 0.6726936466492603, "calibration/coverage@20%": 0.7258934399477807, "calibration/coverage@25%": 0.9431122171453439, "calibration/coverage@30%": 0.9738903394255874, "calibration/coverage@5%": 0.2672119778067885, "calibration/ece": 0.0911618119016537, "calibration/mean_confidence": 0.8336456293516102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001649305555555558, "completions/max_length": 3748.2, "completions/max_terminated_length": 3748.2, "completions/mean_length": 976.6124145507813, "completions/mean_terminated_length": 978.2357543945312, "completions/min_length": 0.0, "completions/min_terminated_length": 276.4, "epoch": 1.7775902801214984, "grad_norm": 0.0005833052564412355, "learning_rate": 1.8028846153846156e-06, "loss": -0.0004, "num_tokens": 1910837849.0, "reward": 1.3210662364959718, "reward_std": 0.11231956481933594, "rewards/accuracy_reward": 0.771006953716278, "rewards/brier_reward": 0.8728455543518067, "rewards/confidence_one_or_zero": 0.00034722223062999547, "rewards/format_reward": 0.9982638835906983, "rewards/mean_confidence_reward": 0.8073420047760009, "signal/accuracy_reward/centered_abs_mean": 0.1018771693110466, "signal/accuracy_reward/group_std_mean": 0.14028796702623367, "signal/accuracy_reward/group_zero_std_frac": 0.5833333492279053, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0509385846555233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0509385846555233, "signal/advantage_abs_mean": 0.07985040321946144, "signal/advantage_pre_scale_abs_mean": 0.07985040321946144, "signal/advantage_pre_scale_std": 0.17008887231349945, "signal/advantage_std": 0.17008887231349945, "signal/brier_reward/centered_abs_mean": 0.06809019893407822, "signal/brier_reward/group_std_mean": 0.09482295513153076, "signal/brier_reward/group_zero_std_frac": 0.11666666641831398, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03404509946703911, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03404509946703911, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006618923391215504, "signal/confidence_one_or_zero/group_std_mean": 0.0016652446240186692, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.618923009682476e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.618923009682476e-09, "signal/format_reward/centered_abs_mean": 0.0032769097131676974, "signal/format_reward/group_std_mean": 0.007922015152871609, "signal/format_reward/group_zero_std_frac": 0.9611111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0016384548565838487, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0016384548565838487, "signal/mean_confidence_reward/centered_abs_mean": 0.0496798574924469, "signal/mean_confidence_reward/group_std_mean": 0.06837576255202293, "signal/mean_confidence_reward/group_zero_std_frac": 0.13333333283662796, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.967985546500131e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.967985546500131e-07, "step": 740 }, { "calibration/aurc": 0.08538492566604036, "calibration/batch_distribution_entropy": 0.44794952605921, "calibration/confidence_entropy": 0.4109114262975684, "calibration/coverage@0%": 0.17552083333333335, "calibration/coverage@1%": 0.19739583333333335, "calibration/coverage@10%": 0.6270833333333334, "calibration/coverage@15%": 0.8130208333333332, "calibration/coverage@20%": 0.8791666666666667, "calibration/coverage@25%": 0.9010416666666666, "calibration/coverage@30%": 0.9723958333333332, "calibration/coverage@5%": 0.4588541666666667, "calibration/ece": 0.10351562499999997, "calibration/mean_confidence": 0.7960156250000001, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002430555555555558, "completions/max_length": 4050.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 1023.4842041015625, "completions/mean_terminated_length": 1026.0055297851563, "completions/min_length": 55.8, "completions/min_terminated_length": 238.2, "epoch": 1.7895901301233734, "grad_norm": 0.0005134017555974424, "learning_rate": 1.7728365384615387e-06, "loss": -0.0022, "num_tokens": 1925746883.0, "reward": 1.2950168609619142, "reward_std": 0.1210503026843071, "rewards/accuracy_reward": 0.7361111044883728, "rewards/brier_reward": 0.8563374638557434, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9975694417953491, "rewards/mean_confidence_reward": 0.7908628344535827, "signal/accuracy_reward/centered_abs_mean": 0.11706814169883728, "signal/accuracy_reward/group_std_mean": 0.15721434950828553, "signal/accuracy_reward/group_zero_std_frac": 0.5444444715976715, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05853407084941864, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05853407084941864, "signal/advantage_abs_mean": 0.08854459524154663, "signal/advantage_pre_scale_abs_mean": 0.08854459524154663, "signal/advantage_pre_scale_std": 0.17863909304141998, "signal/advantage_std": 0.17863909304141998, "signal/brier_reward/centered_abs_mean": 0.0743595078587532, "signal/brier_reward/group_std_mean": 0.10044341385364533, "signal/brier_reward/group_zero_std_frac": 0.11388889253139496, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0371797539293766, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0371797539293766, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.004123263992369175, "signal/format_reward/group_std_mean": 0.007523969188332558, "signal/format_reward/group_zero_std_frac": 0.9694444298744201, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0020616319961845875, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0020616319961845875, "signal/mean_confidence_reward/centered_abs_mean": 0.05387929156422615, "signal/mean_confidence_reward/group_std_mean": 0.07014289796352387, "signal/mean_confidence_reward/group_zero_std_frac": 0.1250000014901161, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.387929263633851e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.387929263633851e-07, "step": 745 }, { "calibration/aurc": 0.1278039490321436, "calibration/batch_distribution_entropy": 0.5371869929104625, "calibration/confidence_entropy": 0.4347890980759809, "calibration/coverage@0%": 0.02879581151832461, "calibration/coverage@1%": 0.02879581151832461, "calibration/coverage@10%": 0.4363348051344443, "calibration/coverage@15%": 0.663458778177245, "calibration/coverage@20%": 0.7633617213238008, "calibration/coverage@25%": 0.9430982766143107, "calibration/coverage@30%": 0.9592441099476439, "calibration/coverage@5%": 0.2504709307888945, "calibration/ece": 0.10111902344230594, "calibration/mean_confidence": 0.7758617316474832, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002690972222222232, "completions/max_length": 3790.2, "completions/max_terminated_length": 3790.2, "completions/mean_length": 1014.7631958007812, "completions/mean_terminated_length": 1017.596630859375, "completions/min_length": 0.0, "completions/min_terminated_length": 260.6, "epoch": 1.8015899801252484, "grad_norm": 0.00047058731433935463, "learning_rate": 1.7427884615384616e-06, "loss": -0.0018, "num_tokens": 1940549083.0, "reward": 1.3231704235076904, "reward_std": 0.11318290382623672, "rewards/accuracy_reward": 0.7697048544883728, "rewards/brier_reward": 0.8793113827705383, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973090410232544, "rewards/mean_confidence_reward": 0.7846727490425109, "signal/accuracy_reward/centered_abs_mean": 0.10029839426279068, "signal/accuracy_reward/group_std_mean": 0.1436718672513962, "signal/accuracy_reward/group_zero_std_frac": 0.5444444477558136, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05014919713139534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05014919713139534, "signal/advantage_abs_mean": 0.0778187520802021, "signal/advantage_pre_scale_abs_mean": 0.0778187520802021, "signal/advantage_pre_scale_std": 0.1671304076910019, "signal/advantage_std": 0.1671304076910019, "signal/brier_reward/centered_abs_mean": 0.06822900474071503, "signal/brier_reward/group_std_mean": 0.09658293426036835, "signal/brier_reward/group_zero_std_frac": 0.14166666865348815, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03411450237035751, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03411450237035751, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.004833984281867742, "signal/format_reward/group_std_mean": 0.010325549356639385, "signal/format_reward/group_zero_std_frac": 0.9527777791023254, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002416992140933871, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002416992140933871, "signal/mean_confidence_reward/centered_abs_mean": 0.04984749779105187, "signal/mean_confidence_reward/group_std_mean": 0.06611247733235359, "signal/mean_confidence_reward/group_zero_std_frac": 0.17222222089767455, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.984749693903723e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.984749693903723e-07, "step": 750 }, { "epoch": 1.8015899801252484, "eval_calibration/aurc": 0.12317924866010717, "eval_calibration/batch_distribution_entropy": 0.5135325467008348, "eval_calibration/confidence_entropy": 0.41996716710052984, "eval_calibration/coverage@0%": 0.22916666666666666, "eval_calibration/coverage@1%": 0.22916666666666666, "eval_calibration/coverage@10%": 0.4635416666666667, "eval_calibration/coverage@15%": 0.6614583333333334, "eval_calibration/coverage@20%": 0.7864583333333334, "eval_calibration/coverage@25%": 0.8958333333333334, "eval_calibration/coverage@30%": 0.9791666666666666, "eval_calibration/coverage@5%": 0.3645833333333333, "eval_calibration/ece": 0.1395833333333333, "eval_calibration/mean_confidence": 0.7848958333333335, "eval_completions/clipped_ratio": 0.0008680555555555617, "eval_completions/max_length": 3015.5, "eval_completions/max_terminated_length": 3015.5, "eval_completions/mean_length": 983.9092203776041, "eval_completions/mean_terminated_length": 984.7552083333334, "eval_completions/min_length": 232.5, "eval_completions/min_terminated_length": 280.8333333333333, "eval_loss": 0.0, "eval_num_tokens": 1940549083.0, "eval_reward": 1.276227315266927, "eval_reward_std": 0.32471097509066266, "eval_rewards/accuracy_reward": 0.7083333333333334, "eval_rewards/brier_reward": 0.8449739714463552, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9991319477558136, "eval_rewards/mean_confidence_reward": 0.7688367664813995, "eval_runtime": 172.3304, "eval_samples_per_second": 5.803, "eval_signal/accuracy_reward/centered_abs_mean": 0.4028862863779068, "eval_signal/accuracy_reward/group_std_mean": 0.4548947314421336, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.2014431431889534, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2014431431889534, "eval_signal/advantage_abs_mean": 0.2793968617916107, "eval_signal/advantage_pre_scale_abs_mean": 0.2793968617916107, "eval_signal/advantage_pre_scale_std": 0.3216147869825363, "eval_signal/advantage_std": 0.3216147869825363, "eval_signal/brier_reward/centered_abs_mean": 0.18342354396979013, "eval_signal/brier_reward/group_std_mean": 0.2393630916873614, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.09171177198489507, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.09171177198489507, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.0016818575871487458, "eval_signal/format_reward/group_std_mean": 0.0049104637776811915, "eval_signal/format_reward/group_zero_std_frac": 0.9722222288449606, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0008409287935743729, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0008409287935743729, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.15755477299292883, "eval_signal/mean_confidence_reward/group_std_mean": 0.203289233148098, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.5755477041542083e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.5755477041542083e-06, "eval_steps_per_second": 0.035, "step": 750 }, { "epoch": 1.8015899801252484, "step": 750, "train_probe_calibration/aurc": 0.11870684137095111, "train_probe_calibration/batch_distribution_entropy": 0.517675201030707, "train_probe_calibration/confidence_entropy": 0.4313748359564881, "train_probe_calibration/coverage@0%": 0.015625, "train_probe_calibration/coverage@1%": 0.015625, "train_probe_calibration/coverage@10%": 0.5364583333333334, "train_probe_calibration/coverage@15%": 0.734375, "train_probe_calibration/coverage@20%": 0.8385416666666666, "train_probe_calibration/coverage@25%": 0.9479166666666666, "train_probe_calibration/coverage@30%": 0.9791666666666666, "train_probe_calibration/coverage@5%": 0.3385416666666667, "train_probe_calibration/ece": 0.13281250000000006, "train_probe_calibration/mean_confidence": 0.7708333333333334, "train_probe_completions/clipped_ratio": 0.0008680555555555617, "train_probe_completions/max_length": 2969.8333333333335, "train_probe_completions/max_terminated_length": 2969.8333333333335, "train_probe_completions/mean_length": 984.3861389160156, "train_probe_completions/mean_terminated_length": 985.1965026855469, "train_probe_completions/min_length": 203.33333333333334, "train_probe_completions/min_terminated_length": 243.83333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 1940549083.0, "train_probe_reward": 1.325800855954488, "train_probe_reward_std": 0.290613720814387, "train_probe_rewards/accuracy_reward": 0.777777781089147, "train_probe_rewards/brier_reward": 0.8755443493525187, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9982638955116272, "train_probe_rewards/mean_confidence_reward": 0.7794530987739563, "train_probe_runtime": 180.2196, "train_probe_samples_per_second": 5.549, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3359375049670537, "train_probe_signal/accuracy_reward/group_std_mean": 0.413636455933253, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.16796875248352686, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.16796875248352686, "train_probe_signal/advantage_abs_mean": 0.23145520190397897, "train_probe_signal/advantage_pre_scale_abs_mean": 0.23145520190397897, "train_probe_signal/advantage_pre_scale_std": 0.29089683791001636, "train_probe_signal/advantage_std": 0.29089683791001636, "train_probe_signal/brier_reward/centered_abs_mean": 0.15131008376677832, "train_probe_signal/brier_reward/group_std_mean": 0.20802813271681467, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07565504188338916, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07565504188338916, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.0033637151742974916, "train_probe_signal/format_reward/group_std_mean": 0.009820927555362383, "train_probe_signal/format_reward/group_zero_std_frac": 0.944444457689921, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0016818575871487458, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.0016818575871487458, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.15226343274116516, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.20152652511994043, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.52263426874318e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.52263426874318e-06, "train_probe_steps_per_second": 0.033 }, { "calibration/aurc": 0.13895048336614158, "calibration/batch_distribution_entropy": 0.5396468030274055, "calibration/confidence_entropy": 0.4417391113879436, "calibration/coverage@0%": 0.034379101049868764, "calibration/coverage@1%": 0.034379101049868764, "calibration/coverage@10%": 0.4585999015748031, "calibration/coverage@15%": 0.5949187992125984, "calibration/coverage@20%": 0.7332718175853018, "calibration/coverage@25%": 0.9079683398950131, "calibration/coverage@30%": 0.965469160104987, "calibration/coverage@5%": 0.20130823490813646, "calibration/ece": 0.1332658710629921, "calibration/mean_confidence": 0.7747664452099738, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00199652777777779, "completions/max_length": 3798.6, "completions/max_terminated_length": 3798.6, "completions/mean_length": 972.9678955078125, "completions/mean_terminated_length": 974.9191650390625, "completions/min_length": 0.0, "completions/min_terminated_length": 250.4, "epoch": 1.8135898301271234, "grad_norm": 0.0005308398394845426, "learning_rate": 1.7127403846153848e-06, "loss": -0.0009, "num_tokens": 1954848969.0, "reward": 1.3073286533355712, "reward_std": 0.10729158222675324, "rewards/accuracy_reward": 0.7454861164093017, "rewards/brier_reward": 0.8711525678634644, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9980034708976746, "rewards/mean_confidence_reward": 0.755664050579071, "signal/accuracy_reward/centered_abs_mean": 0.10810546949505806, "signal/accuracy_reward/group_std_mean": 0.14585590958595276, "signal/accuracy_reward/group_zero_std_frac": 0.574999988079071, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05405273474752903, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05405273474752903, "signal/advantage_abs_mean": 0.07904635146260261, "signal/advantage_pre_scale_abs_mean": 0.07904635146260261, "signal/advantage_pre_scale_std": 0.16033744663000107, "signal/advantage_std": 0.16033744663000107, "signal/brier_reward/centered_abs_mean": 0.06547573283314705, "signal/brier_reward/group_std_mean": 0.08704073578119279, "signal/brier_reward/group_zero_std_frac": 0.15555555671453475, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03273786641657352, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03273786641657352, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.003597005154006183, "signal/format_reward/group_std_mean": 0.007198850810527802, "signal/format_reward/group_zero_std_frac": 0.9694444298744201, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0017985025770030916, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0017985025770030916, "signal/mean_confidence_reward/centered_abs_mean": 0.05586262941360474, "signal/mean_confidence_reward/group_std_mean": 0.07410713732242584, "signal/mean_confidence_reward/group_zero_std_frac": 0.15833333283662795, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.586263000623149e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.586263000623149e-07, "step": 755 }, { "calibration/aurc": 0.10768595617205876, "calibration/batch_distribution_entropy": 0.5730426211348049, "calibration/confidence_entropy": 0.48555160342206793, "calibration/coverage@0%": 0.22576118289748887, "calibration/coverage@1%": 0.2503042638374366, "calibration/coverage@10%": 0.5637561358271932, "calibration/coverage@15%": 0.608179467747962, "calibration/coverage@20%": 0.7774174717714926, "calibration/coverage@25%": 0.8669093586387435, "calibration/coverage@30%": 0.9520833333333334, "calibration/coverage@5%": 0.40814255880369454, "calibration/ece": 0.11244764575900276, "calibration/mean_confidence": 0.7402468040015447, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002690972222222232, "completions/max_length": 3930.0, "completions/max_terminated_length": 3930.0, "completions/mean_length": 951.5751586914063, "completions/mean_terminated_length": 954.197021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 222.4, "epoch": 1.8255896801289984, "grad_norm": 0.00044432529830373824, "learning_rate": 1.682692307692308e-06, "loss": -0.002, "num_tokens": 1968881707.0, "reward": 1.3183253049850463, "reward_std": 0.09663136750459671, "rewards/accuracy_reward": 0.7651909708976745, "rewards/brier_reward": 0.8742226481437683, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9972222208976745, "rewards/mean_confidence_reward": 0.7355381965637207, "signal/accuracy_reward/centered_abs_mean": 0.10411784052848816, "signal/accuracy_reward/group_std_mean": 0.13734591603279114, "signal/accuracy_reward/group_zero_std_frac": 0.6027777671813965, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05205892026424408, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05205892026424408, "signal/advantage_abs_mean": 0.0707747146487236, "signal/advantage_pre_scale_abs_mean": 0.0707747146487236, "signal/advantage_pre_scale_std": 0.14913327991962433, "signal/advantage_std": 0.14913327991962433, "signal/brier_reward/centered_abs_mean": 0.05873942822217941, "signal/brier_reward/group_std_mean": 0.07941988706588746, "signal/brier_reward/group_zero_std_frac": 0.12777777910232543, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.029369714111089705, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.029369714111089705, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0047309027751907705, "signal/format_reward/group_std_mean": 0.009351404011249542, "signal/format_reward/group_zero_std_frac": 0.9583333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0023654513875953852, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0023654513875953852, "signal/mean_confidence_reward/centered_abs_mean": 0.053496094048023225, "signal/mean_confidence_reward/group_std_mean": 0.07083378508687019, "signal/mean_confidence_reward/group_zero_std_frac": 0.13055555671453475, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.349609523364051e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.349609523364051e-07, "step": 760 }, { "calibration/aurc": 0.10523545580714518, "calibration/batch_distribution_entropy": 0.6208780079114519, "calibration/confidence_entropy": 0.479588044470991, "calibration/coverage@0%": 0.27635964912280697, "calibration/coverage@1%": 0.35004385964912277, "calibration/coverage@10%": 0.5101370614035088, "calibration/coverage@15%": 0.5635416666666666, "calibration/coverage@20%": 0.8723958333333333, "calibration/coverage@25%": 0.9958333333333332, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.4214692982456141, "calibration/ece": 0.14023382675438592, "calibration/mean_confidence": 0.7443253837719299, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 3583.8, "completions/max_terminated_length": 3583.8, "completions/mean_length": 1027.271533203125, "completions/mean_terminated_length": 1028.0674682617187, "completions/min_length": 64.0, "completions/min_terminated_length": 258.4, "epoch": 1.8375895301308733, "grad_norm": 0.0003903246542904526, "learning_rate": 1.6526442307692309e-06, "loss": 0.0012, "num_tokens": 1983828291.0, "reward": 1.2966579914093017, "reward_std": 0.09648162871599197, "rewards/accuracy_reward": 0.7420138955116272, "rewards/brier_reward": 0.8520694494247436, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999218738079071, "rewards/mean_confidence_reward": 0.6925086498260498, "signal/accuracy_reward/centered_abs_mean": 0.10745442807674407, "signal/accuracy_reward/group_std_mean": 0.1458025962114334, "signal/accuracy_reward/group_zero_std_frac": 0.5694444596767425, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05372721403837204, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05372721403837204, "signal/advantage_abs_mean": 0.0702558621764183, "signal/advantage_pre_scale_abs_mean": 0.0702558621764183, "signal/advantage_pre_scale_std": 0.1391596168279648, "signal/advantage_std": 0.1391596168279648, "signal/brier_reward/centered_abs_mean": 0.06366429775953293, "signal/brier_reward/group_std_mean": 0.08459177762269973, "signal/brier_reward/group_zero_std_frac": 0.08611111268401146, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.031832148879766466, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.031832148879766466, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0014485677354969084, "signal/format_reward/group_std_mean": 0.003388595022261143, "signal/format_reward/group_zero_std_frac": 0.9833333253860473, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0007242838677484542, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0007242838677484542, "signal/mean_confidence_reward/centered_abs_mean": 0.05934244841337204, "signal/mean_confidence_reward/group_std_mean": 0.07782920151948929, "signal/mean_confidence_reward/group_zero_std_frac": 0.09166666865348816, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.934244654781651e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.934244654781651e-07, "step": 765 }, { "calibration/aurc": 0.14769688122490074, "calibration/batch_distribution_entropy": 0.6292561751758475, "calibration/confidence_entropy": 0.5027214959125096, "calibration/coverage@0%": 0.27255204097647273, "calibration/coverage@1%": 0.27777397309135526, "calibration/coverage@10%": 0.30946796216239353, "calibration/coverage@15%": 0.5113771005435945, "calibration/coverage@20%": 0.7623651528770563, "calibration/coverage@25%": 0.8, "calibration/coverage@30%": 0.9098958333333332, "calibration/coverage@5%": 0.29526031188916946, "calibration/ece": 0.13428807965586606, "calibration/mean_confidence": 0.7125401556592332, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004079861111111094, "completions/max_length": 3991.8, "completions/max_terminated_length": 3991.8, "completions/mean_length": 1048.739697265625, "completions/mean_terminated_length": 1053.1176391601562, "completions/min_length": 57.8, "completions/min_terminated_length": 250.2, "epoch": 1.8495893801327483, "grad_norm": 0.00042604634654708207, "learning_rate": 1.622596153846154e-06, "loss": -0.002, "num_tokens": 1998991116.0, "reward": 1.30513174533844, "reward_std": 0.09186576753854751, "rewards/accuracy_reward": 0.7505208253860474, "rewards/brier_reward": 0.8638084769248963, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9959201335906982, "rewards/mean_confidence_reward": 0.6986440896987915, "signal/accuracy_reward/centered_abs_mean": 0.09621310830116273, "signal/accuracy_reward/group_std_mean": 0.1310509517788887, "signal/accuracy_reward/group_zero_std_frac": 0.6111111044883728, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04810655415058136, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04810655415058136, "signal/advantage_abs_mean": 0.06549866348505021, "signal/advantage_pre_scale_abs_mean": 0.06549866348505021, "signal/advantage_pre_scale_std": 0.14081084430217744, "signal/advantage_std": 0.14081084430217744, "signal/brier_reward/centered_abs_mean": 0.0581412062048912, "signal/brier_reward/group_std_mean": 0.07875731438398362, "signal/brier_reward/group_zero_std_frac": 0.10277777910232544, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0290706031024456, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0290706031024456, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.005767144169658423, "signal/format_reward/group_std_mean": 0.010861166566610337, "signal/format_reward/group_zero_std_frac": 0.9527777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0028835720848292114, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0028835720848292114, "signal/mean_confidence_reward/centered_abs_mean": 0.05620442777872085, "signal/mean_confidence_reward/group_std_mean": 0.07394770681858062, "signal/mean_confidence_reward/group_zero_std_frac": 0.10555555671453476, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.620442607323639e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.620442607323639e-07, "step": 770 }, { "calibration/aurc": 0.12282983667672753, "calibration/batch_distribution_entropy": 0.5988635746696642, "calibration/confidence_entropy": 0.4902626215720833, "calibration/coverage@0%": 0.10888961605584642, "calibration/coverage@1%": 0.10888961605584642, "calibration/coverage@10%": 0.5991982984293194, "calibration/coverage@15%": 0.6956451788830715, "calibration/coverage@20%": 0.8573025741710296, "calibration/coverage@25%": 0.8807400741710296, "calibration/coverage@30%": 0.8869900741710296, "calibration/coverage@5%": 0.10888961605584642, "calibration/ece": 0.10914757853403137, "calibration/mean_confidence": 0.7247114965095987, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032986111111110938, "completions/max_length": 3912.2, "completions/max_terminated_length": 3912.2, "completions/mean_length": 1059.7978393554688, "completions/mean_terminated_length": 1063.3177368164063, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 1.8615892301346233, "grad_norm": 0.0004538172797765583, "learning_rate": 1.592548076923077e-06, "loss": -0.0028, "num_tokens": 2014309843.0, "reward": 1.309248423576355, "reward_std": 0.09768530875444412, "rewards/accuracy_reward": 0.7598958492279053, "rewards/brier_reward": 0.8618854403495788, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9967013835906983, "rewards/mean_confidence_reward": 0.708142352104187, "signal/accuracy_reward/centered_abs_mean": 0.1010525181889534, "signal/accuracy_reward/group_std_mean": 0.13816336393356324, "signal/accuracy_reward/group_zero_std_frac": 0.5916666865348816, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0505262590944767, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0505262590944767, "signal/advantage_abs_mean": 0.06988802999258041, "signal/advantage_pre_scale_abs_mean": 0.06988802999258041, "signal/advantage_pre_scale_std": 0.14795678555965425, "signal/advantage_std": 0.14795678555965425, "signal/brier_reward/centered_abs_mean": 0.061630909144878385, "signal/brier_reward/group_std_mean": 0.08265089392662048, "signal/brier_reward/group_zero_std_frac": 0.12500000298023223, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.030815454572439192, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.030815454572439192, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.005707465356681496, "signal/format_reward/group_std_mean": 0.011728018708527087, "signal/format_reward/group_zero_std_frac": 0.9472222208976746, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002853732678340748, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002853732678340748, "signal/mean_confidence_reward/centered_abs_mean": 0.05657145157456398, "signal/mean_confidence_reward/group_std_mean": 0.07501066923141479, "signal/mean_confidence_reward/group_zero_std_frac": 0.12777777910232543, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.657144924953172e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.657144924953172e-07, "step": 775 }, { "calibration/aurc": 0.1193719989612461, "calibration/batch_distribution_entropy": 0.6663999884472174, "calibration/confidence_entropy": 0.4891572687700586, "calibration/coverage@0%": 0.24097547872778993, "calibration/coverage@1%": 0.28611983568317056, "calibration/coverage@10%": 0.41591980321831495, "calibration/coverage@15%": 0.6581829257190364, "calibration/coverage@20%": 0.7213390936980391, "calibration/coverage@25%": 0.8933518760907504, "calibration/coverage@30%": 0.9602694153577662, "calibration/coverage@5%": 0.32687196822910225, "calibration/ece": 0.10955297697571834, "calibration/mean_confidence": 0.6949274135644694, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003732638888888884, "completions/max_length": 3795.2, "completions/max_terminated_length": 3795.2, "completions/mean_length": 1024.5405639648438, "completions/mean_terminated_length": 1028.49833984375, "completions/min_length": 0.0, "completions/min_terminated_length": 258.2, "epoch": 1.8735890801364983, "grad_norm": 0.0005017994553782046, "learning_rate": 1.5625e-06, "loss": -0.0035, "num_tokens": 2029185222.0, "reward": 1.3203709363937377, "reward_std": 0.09855225682258606, "rewards/accuracy_reward": 0.7678819417953491, "rewards/brier_reward": 0.8765779376029968, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9962673544883728, "rewards/mean_confidence_reward": 0.729370653629303, "signal/accuracy_reward/centered_abs_mean": 0.09713541567325593, "signal/accuracy_reward/group_std_mean": 0.13192613124847413, "signal/accuracy_reward/group_zero_std_frac": 0.6083333492279053, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04856770783662796, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04856770783662796, "signal/advantage_abs_mean": 0.07029455974698066, "signal/advantage_pre_scale_abs_mean": 0.07029455974698066, "signal/advantage_pre_scale_std": 0.15004289150238037, "signal/advantage_std": 0.15004289150238037, "signal/brier_reward/centered_abs_mean": 0.060877188295125964, "signal/brier_reward/group_std_mean": 0.08337771445512772, "signal/brier_reward/group_zero_std_frac": 0.1361111134290695, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.030438594147562982, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.030438594147562982, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0065375435049645604, "signal/format_reward/group_std_mean": 0.01388431005179882, "signal/format_reward/group_zero_std_frac": 0.9361111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0032687717524822802, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0032687717524822802, "signal/mean_confidence_reward/centered_abs_mean": 0.05846787616610527, "signal/mean_confidence_reward/group_std_mean": 0.07618593722581864, "signal/mean_confidence_reward/group_zero_std_frac": 0.1361111134290695, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.846787303198653e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.846787303198653e-07, "step": 780 }, { "calibration/aurc": 0.10492474359836032, "calibration/batch_distribution_entropy": 0.6432680327611955, "calibration/confidence_entropy": 0.489598094266969, "calibration/coverage@0%": 0.1390625, "calibration/coverage@1%": 0.1390625, "calibration/coverage@10%": 0.5784622497824194, "calibration/coverage@15%": 0.6232539164490861, "calibration/coverage@20%": 0.78610068537859, "calibration/coverage@25%": 0.9514360313315926, "calibration/coverage@30%": 0.9832898172323759, "calibration/coverage@5%": 0.4404414164490861, "calibration/ece": 0.13960264360313307, "calibration/mean_confidence": 0.709441090078329, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0019097222222222098, "completions/max_length": 3787.0, "completions/max_terminated_length": 3787.0, "completions/mean_length": 1019.3605346679688, "completions/mean_terminated_length": 1021.4125122070312, "completions/min_length": 52.4, "completions/min_terminated_length": 241.6, "epoch": 1.8855889301383733, "grad_norm": 0.000506190990563482, "learning_rate": 1.5324519230769232e-06, "loss": -0.0008, "num_tokens": 2044005631.0, "reward": 1.3156396865844726, "reward_std": 0.10233260840177535, "rewards/accuracy_reward": 0.7608506917953491, "rewards/brier_reward": 0.872324001789093, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9980902791023254, "rewards/mean_confidence_reward": 0.7224869728088379, "signal/accuracy_reward/centered_abs_mean": 0.11386176347732543, "signal/accuracy_reward/group_std_mean": 0.1513701930642128, "signal/accuracy_reward/group_zero_std_frac": 0.5666666865348816, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05693088173866272, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05693088173866272, "signal/advantage_abs_mean": 0.07546657547354699, "signal/advantage_pre_scale_abs_mean": 0.07546657547354699, "signal/advantage_pre_scale_std": 0.1552779883146286, "signal/advantage_std": 0.1552779883146286, "signal/brier_reward/centered_abs_mean": 0.05938533172011375, "signal/brier_reward/group_std_mean": 0.08088604807853698, "signal/brier_reward/group_zero_std_frac": 0.2055555611848831, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.029692665860056876, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.029692665860056876, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.003526475699618459, "signal/format_reward/group_std_mean": 0.007791919447481632, "signal/format_reward/group_zero_std_frac": 0.9638888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0017632378498092295, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0017632378498092295, "signal/mean_confidence_reward/centered_abs_mean": 0.05164903253316879, "signal/mean_confidence_reward/group_std_mean": 0.06859839260578156, "signal/mean_confidence_reward/group_zero_std_frac": 0.2083333373069763, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.164903257082188e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.164903257082188e-07, "step": 785 }, { "calibration/aurc": 0.11490591917951534, "calibration/batch_distribution_entropy": 0.6071148821328102, "calibration/confidence_entropy": 0.4670101421079006, "calibration/coverage@0%": 0.0291005291005291, "calibration/coverage@1%": 0.1869130291005291, "calibration/coverage@10%": 0.5470956240416097, "calibration/coverage@15%": 0.7646576733391355, "calibration/coverage@20%": 0.8305870917775291, "calibration/coverage@25%": 0.8747669859574234, "calibration/coverage@30%": 0.9243386243386243, "calibration/coverage@5%": 0.3850255182215038, "calibration/ece": 0.10233554648106695, "calibration/mean_confidence": 0.7152846861746586, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003645833333333326, "completions/max_length": 3976.6, "completions/max_terminated_length": 3976.6, "completions/mean_length": 1038.4816162109375, "completions/mean_terminated_length": 1042.2583251953124, "completions/min_length": 0.0, "completions/min_terminated_length": 250.8, "epoch": 1.8975887801402482, "grad_norm": 0.0005224856431595981, "learning_rate": 1.5024038461538462e-06, "loss": -0.0005, "num_tokens": 2059056715.0, "reward": 1.3066727638244628, "reward_std": 0.10674758851528168, "rewards/accuracy_reward": 0.7521701455116272, "rewards/brier_reward": 0.8648936748504639, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9962673544883728, "rewards/mean_confidence_reward": 0.7175607442855835, "signal/accuracy_reward/centered_abs_mean": 0.11585828959941864, "signal/accuracy_reward/group_std_mean": 0.15172154009342192, "signal/accuracy_reward/group_zero_std_frac": 0.5666666626930237, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05792914479970932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05792914479970932, "signal/advantage_abs_mean": 0.07807408422231674, "signal/advantage_pre_scale_abs_mean": 0.07807408422231674, "signal/advantage_pre_scale_std": 0.1602114200592041, "signal/advantage_std": 0.1602114200592041, "signal/brier_reward/centered_abs_mean": 0.06447135657072067, "signal/brier_reward/group_std_mean": 0.08666761070489884, "signal/brier_reward/group_zero_std_frac": 0.17777778059244156, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.032235678285360336, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.032235678285360336, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.006526692677289248, "signal/format_reward/group_std_mean": 0.013552109338343143, "signal/format_reward/group_zero_std_frac": 0.9388888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.003263346338644624, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.003263346338644624, "signal/mean_confidence_reward/centered_abs_mean": 0.06045165881514549, "signal/mean_confidence_reward/group_std_mean": 0.07800682634115219, "signal/mean_confidence_reward/group_zero_std_frac": 0.1833333358168602, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.045165719115175e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.045165719115175e-07, "step": 790 }, { "calibration/aurc": 0.2344706607537596, "calibration/batch_distribution_entropy": 0.5717636396064506, "calibration/confidence_entropy": 0.4568149259450266, "calibration/coverage@0%": 0.0005235602094240838, "calibration/coverage@1%": 0.0005235602094240838, "calibration/coverage@10%": 0.16919196752012905, "calibration/coverage@15%": 0.1749360928464998, "calibration/coverage@20%": 0.3520194261798332, "calibration/coverage@25%": 0.5132089877835951, "calibration/coverage@30%": 0.8712832678883071, "calibration/coverage@5%": 0.11070632783344496, "calibration/ece": 0.15288041345422165, "calibration/mean_confidence": 0.7443476308501815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002517361111111138, "completions/max_length": 3708.6, "completions/max_terminated_length": 3708.6, "completions/mean_length": 984.4074829101562, "completions/mean_terminated_length": 986.9249633789062, "completions/min_length": 0.0, "completions/min_terminated_length": 267.2, "epoch": 1.9095886301421232, "grad_norm": 0.00044028559932485223, "learning_rate": 1.4723557692307693e-06, "loss": -0.002, "num_tokens": 2073471073.0, "reward": 1.2937115907669068, "reward_std": 0.09675345420837403, "rewards/accuracy_reward": 0.7311631917953492, "rewards/brier_reward": 0.8587623596191406, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9974826335906982, "rewards/mean_confidence_reward": 0.7448394060134887, "signal/accuracy_reward/centered_abs_mean": 0.09666341096162796, "signal/accuracy_reward/group_std_mean": 0.1310340642929077, "signal/accuracy_reward/group_zero_std_frac": 0.6055555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04833170548081398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04833170548081398, "signal/advantage_abs_mean": 0.06797323822975158, "signal/advantage_pre_scale_abs_mean": 0.06797323822975158, "signal/advantage_pre_scale_std": 0.14812732338905335, "signal/advantage_std": 0.14812732338905335, "signal/brier_reward/centered_abs_mean": 0.06116928979754448, "signal/brier_reward/group_std_mean": 0.08454878926277161, "signal/brier_reward/group_zero_std_frac": 0.20555555820465088, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03058464489877224, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03058464489877224, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0045084634679369625, "signal/format_reward/group_std_mean": 0.009642397239804267, "signal/format_reward/group_zero_std_frac": 0.955555546283722, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0022542317339684812, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0022542317339684812, "signal/mean_confidence_reward/centered_abs_mean": 0.051269262284040454, "signal/mean_confidence_reward/group_std_mean": 0.068087337911129, "signal/mean_confidence_reward/group_zero_std_frac": 0.22222222089767457, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.126925884724187e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.126925884724187e-07, "step": 795 }, { "calibration/aurc": 0.10023426754465517, "calibration/batch_distribution_entropy": 0.5951830831519115, "calibration/confidence_entropy": 0.45741941477828973, "calibration/coverage@0%": 0.2015625, "calibration/coverage@1%": 0.2015625, "calibration/coverage@10%": 0.6147486174215168, "calibration/coverage@15%": 0.6772486174215169, "calibration/coverage@20%": 0.8048673358552113, "calibration/coverage@25%": 0.8843782765568141, "calibration/coverage@30%": 0.936646007226414, "calibration/coverage@5%": 0.4330967145343777, "calibration/ece": 0.1305562335315542, "calibration/mean_confidence": 0.7449381432116596, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002343750000000022, "completions/max_length": 3782.8, "completions/max_terminated_length": 3782.8, "completions/mean_length": 1039.9683227539062, "completions/mean_terminated_length": 1042.4302490234375, "completions/min_length": 0.0, "completions/min_terminated_length": 288.8, "epoch": 1.9215884801439982, "grad_norm": 0.000610596383921802, "learning_rate": 1.4423076923076922e-06, "loss": -0.0001, "num_tokens": 2088578100.0, "reward": 1.313480830192566, "reward_std": 0.11972246766090393, "rewards/accuracy_reward": 0.7565104126930237, "rewards/brier_reward": 0.8727799415588379, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99765625, "rewards/mean_confidence_reward": 0.7530729055404664, "signal/accuracy_reward/centered_abs_mean": 0.1270562082529068, "signal/accuracy_reward/group_std_mean": 0.16453761160373687, "signal/accuracy_reward/group_zero_std_frac": 0.5416666686534881, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0635281041264534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0635281041264534, "signal/advantage_abs_mean": 0.08855947852134705, "signal/advantage_pre_scale_abs_mean": 0.08855947852134705, "signal/advantage_pre_scale_std": 0.17465413808822633, "signal/advantage_std": 0.17465413808822633, "signal/brier_reward/centered_abs_mean": 0.06844681799411774, "signal/brier_reward/group_std_mean": 0.09378228187561036, "signal/brier_reward/group_zero_std_frac": 0.20555555820465088, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03422340899705887, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03422340899705887, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.004389105853624642, "signal/format_reward/group_std_mean": 0.01032851729542017, "signal/format_reward/group_zero_std_frac": 0.950000011920929, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002194552926812321, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002194552926812321, "signal/mean_confidence_reward/centered_abs_mean": 0.05531005784869194, "signal/mean_confidence_reward/group_std_mean": 0.07360672652721405, "signal/mean_confidence_reward/group_zero_std_frac": 0.22777778208255767, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.531005172088043e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.531005172088043e-07, "step": 800 }, { "epoch": 1.9215884801439982, "eval_calibration/aurc": 0.11451958087356617, "eval_calibration/batch_distribution_entropy": 0.5586176200645917, "eval_calibration/confidence_entropy": 0.43380664292216453, "eval_calibration/coverage@0%": 0.20833333333333334, "eval_calibration/coverage@1%": 0.20833333333333334, "eval_calibration/coverage@10%": 0.4739583333333333, "eval_calibration/coverage@15%": 0.59375, "eval_calibration/coverage@20%": 0.8541666666666666, "eval_calibration/coverage@25%": 0.9322916666666666, "eval_calibration/coverage@30%": 0.9947916666666666, "eval_calibration/coverage@5%": 0.21354166666666666, "eval_calibration/ece": 0.13515625, "eval_calibration/mean_confidence": 0.7638020833333333, "eval_completions/clipped_ratio": 0.0017361111111111234, "eval_completions/max_length": 3256.0, "eval_completions/max_terminated_length": 3256.0, "eval_completions/mean_length": 997.5048828125, "eval_completions/mean_terminated_length": 999.2733561197916, "eval_completions/min_length": 191.0, "eval_completions/min_terminated_length": 282.1666666666667, "eval_loss": 0.0, "eval_num_tokens": 2088578100.0, "eval_reward": 1.292261262734731, "eval_reward_std": 0.31245610614617664, "eval_rewards/accuracy_reward": 0.7274305522441864, "eval_rewards/brier_reward": 0.8588129381338755, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9982638955116272, "eval_rewards/mean_confidence_reward": 0.7582898934682211, "eval_runtime": 184.8534, "eval_samples_per_second": 5.41, "eval_signal/accuracy_reward/centered_abs_mean": 0.3861762136220932, "eval_signal/accuracy_reward/group_std_mean": 0.44508100549379986, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1930881068110466, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.1930881068110466, "eval_signal/advantage_abs_mean": 0.2624824196100235, "eval_signal/advantage_pre_scale_abs_mean": 0.2624824196100235, "eval_signal/advantage_pre_scale_std": 0.31007521351178485, "eval_signal/advantage_std": 0.31007521351178485, "eval_signal/brier_reward/centered_abs_mean": 0.16589545706907907, "eval_signal/brier_reward/group_std_mean": 0.22468140721321106, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08294772853453954, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.08294772853453954, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.0033637151742974916, "eval_signal/format_reward/group_std_mean": 0.009820927555362383, "eval_signal/format_reward/group_zero_std_frac": 0.944444457689921, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0016818575871487458, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.166067140797774, "eval_signal/mean_confidence_reward/group_std_mean": 0.21214225639899573, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.6606713491758758e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.6606713491758758e-06, "eval_steps_per_second": 0.032, "step": 800 }, { "epoch": 1.9215884801439982, "step": 800, "train_probe_calibration/aurc": 0.11029869946141839, "train_probe_calibration/batch_distribution_entropy": 0.5833106783314401, "train_probe_calibration/confidence_entropy": 0.4457275208989561, "train_probe_calibration/coverage@0%": 0.0, "train_probe_calibration/coverage@1%": 0.0, "train_probe_calibration/coverage@10%": 0.5, "train_probe_calibration/coverage@15%": 0.7395833333333334, "train_probe_calibration/coverage@20%": 0.8958333333333334, "train_probe_calibration/coverage@25%": 0.9895833333333334, "train_probe_calibration/coverage@30%": 1.0, "train_probe_calibration/coverage@5%": 0.125, "train_probe_calibration/ece": 0.11093749999999998, "train_probe_calibration/mean_confidence": 0.7651041666666667, "train_probe_completions/clipped_ratio": 0.001736111111111105, "train_probe_completions/max_length": 3218.8333333333335, "train_probe_completions/max_terminated_length": 3218.8333333333335, "train_probe_completions/mean_length": 981.642567952474, "train_probe_completions/mean_terminated_length": 983.339589436849, "train_probe_completions/min_length": 217.0, "train_probe_completions/min_terminated_length": 252.5, "train_probe_loss": 0.0, "train_probe_num_tokens": 2088578100.0, "train_probe_reward": 1.3337881366411846, "train_probe_reward_std": 0.2803538590669632, "train_probe_rewards/accuracy_reward": 0.7890625099341074, "train_probe_rewards/brier_reward": 0.8802343805631002, "train_probe_rewards/confidence_one_or_zero": 0.0008680555814256271, "train_probe_rewards/format_reward": 0.9982638855775198, "train_probe_rewards/mean_confidence_reward": 0.770486076672872, "train_probe_runtime": 186.6446, "train_probe_samples_per_second": 5.358, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3210177967945735, "train_probe_signal/accuracy_reward/group_std_mean": 0.40238048632939655, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.16050889839728674, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.16050889839728674, "train_probe_signal/advantage_abs_mean": 0.2200667733947436, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2200667733947436, "train_probe_signal/advantage_pre_scale_std": 0.2804074635108312, "train_probe_signal/advantage_std": 0.2804074635108312, "train_probe_signal/brier_reward/centered_abs_mean": 0.14370146145423254, "train_probe_signal/brier_reward/group_std_mean": 0.20114165792862573, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07185073072711627, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07185073072711627, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0016818575871487458, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0049104637776811915, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 0.9722222288449606, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151002e-08, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151002e-08, "train_probe_signal/format_reward/centered_abs_mean": 0.0033637151742974916, "train_probe_signal/format_reward/group_std_mean": 0.009820927555362383, "train_probe_signal/format_reward/group_zero_std_frac": 0.9444444477558136, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0016818575871487458, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.0016818575871487458, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.1552788441379865, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.20243067542711893, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.5527884329458175e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.5527884329458175e-06, "train_probe_steps_per_second": 0.032 }, { "calibration/aurc": 0.06487691659516702, "calibration/batch_distribution_entropy": 0.4799956967732612, "calibration/confidence_entropy": 0.42141757518767103, "calibration/coverage@0%": 0.0010443864229765013, "calibration/coverage@1%": 0.3786485530896432, "calibration/coverage@10%": 0.6880235530896431, "calibration/coverage@15%": 0.8728323542210618, "calibration/coverage@20%": 0.965082408616188, "calibration/coverage@25%": 0.9953125, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.595836053089643, "calibration/ece": 0.11709808801131409, "calibration/mean_confidence": 0.7874485286118365, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001822916666666674, "completions/max_length": 3622.6, "completions/max_terminated_length": 3622.6, "completions/mean_length": 981.7848388671875, "completions/mean_terminated_length": 983.56513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 273.8, "epoch": 1.9335883301458732, "grad_norm": 0.0005223756888881326, "learning_rate": 1.4122596153846154e-06, "loss": -0.0008, "num_tokens": 2102989829.0, "reward": 1.3148869514465331, "reward_std": 0.09118190705776215, "rewards/accuracy_reward": 0.7559895753860474, "rewards/brier_reward": 0.8756787776947021, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9980902791023254, "rewards/mean_confidence_reward": 0.7634179711341857, "signal/accuracy_reward/centered_abs_mean": 0.09243164211511612, "signal/accuracy_reward/group_std_mean": 0.12682606726884843, "signal/accuracy_reward/group_zero_std_frac": 0.6194444537162781, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04621582105755806, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04621582105755806, "signal/advantage_abs_mean": 0.06489204093813897, "signal/advantage_pre_scale_abs_mean": 0.06489204093813897, "signal/advantage_pre_scale_std": 0.14571058452129365, "signal/advantage_std": 0.14571058452129365, "signal/brier_reward/centered_abs_mean": 0.054600251466035844, "signal/brier_reward/group_std_mean": 0.07518918961286544, "signal/brier_reward/group_zero_std_frac": 0.3138888865709305, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.027300125733017922, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.027300125733017922, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.00362413190305233, "signal/format_reward/group_std_mean": 0.008956741448491812, "signal/format_reward/group_zero_std_frac": 0.9555555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.001812065951526165, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.001812065951526165, "signal/mean_confidence_reward/centered_abs_mean": 0.0489314891397953, "signal/mean_confidence_reward/group_std_mean": 0.06458591893315316, "signal/mean_confidence_reward/group_zero_std_frac": 0.3277777820825577, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.893148854989704e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.893148854989704e-07, "step": 805 }, { "calibration/aurc": 0.16142285728868233, "calibration/batch_distribution_entropy": 0.4612159279923329, "calibration/confidence_entropy": 0.4099892554581054, "calibration/coverage@0%": 0.0093818208061786, "calibration/coverage@1%": 0.0093818208061786, "calibration/coverage@10%": 0.14201889652419425, "calibration/coverage@15%": 0.48876470809947714, "calibration/coverage@20%": 0.7192083003536112, "calibration/coverage@25%": 0.8761143398401897, "calibration/coverage@30%": 0.9007012795275591, "calibration/coverage@5%": 0.11695362237275823, "calibration/ece": 0.12546078346970668, "calibration/mean_confidence": 0.8101161903452507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001475694444444442, "completions/max_length": 3866.6, "completions/max_terminated_length": 3866.6, "completions/mean_length": 993.4398559570312, "completions/mean_terminated_length": 994.915380859375, "completions/min_length": 0.0, "completions/min_terminated_length": 263.8, "epoch": 1.9455881801477481, "grad_norm": 0.0005623899633064866, "learning_rate": 1.3822115384615387e-06, "loss": -0.0009, "num_tokens": 2117533360.0, "reward": 1.323251438140869, "reward_std": 0.10489301979541779, "rewards/accuracy_reward": 0.7852430582046509, "rewards/brier_reward": 0.8627194046974183, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998524296283722, "rewards/mean_confidence_reward": 0.8024001598358155, "signal/accuracy_reward/centered_abs_mean": 0.09473741203546523, "signal/accuracy_reward/group_std_mean": 0.1338774934411049, "signal/accuracy_reward/group_zero_std_frac": 0.5805555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04736870601773262, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04736870601773262, "signal/advantage_abs_mean": 0.07377447634935379, "signal/advantage_pre_scale_abs_mean": 0.07377447634935379, "signal/advantage_pre_scale_std": 0.16231746077537537, "signal/advantage_std": 0.16231746077537537, "signal/brier_reward/centered_abs_mean": 0.06395515948534011, "signal/brier_reward/group_std_mean": 0.0895268514752388, "signal/brier_reward/group_zero_std_frac": 0.2583333343267441, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.031977579742670056, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.031977579742670056, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.002826605807058513, "signal/format_reward/group_std_mean": 0.007450965791940689, "signal/format_reward/group_zero_std_frac": 0.9611111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0014133029035292566, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0014133029035292566, "signal/mean_confidence_reward/centered_abs_mean": 0.04411567375063896, "signal/mean_confidence_reward/group_std_mean": 0.0603493481874466, "signal/mean_confidence_reward/group_zero_std_frac": 0.2944444537162781, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.411567488205037e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.411567488205037e-07, "step": 810 }, { "calibration/aurc": 0.08099616400053547, "calibration/batch_distribution_entropy": 0.43837254670218523, "calibration/confidence_entropy": 0.40528665574797473, "calibration/coverage@0%": 0.13442434210526316, "calibration/coverage@1%": 0.30213267543859657, "calibration/coverage@10%": 0.7162678931336174, "calibration/coverage@15%": 0.7199232456140352, "calibration/coverage@20%": 0.9223355263157895, "calibration/coverage@25%": 0.9936842105263157, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.484906798245614, "calibration/ece": 0.1043220034011267, "calibration/mean_confidence": 0.7918174256218223, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0037326388888888618, "completions/max_length": 4050.4, "completions/max_terminated_length": 4050.4, "completions/mean_length": 1051.1509765625, "completions/mean_terminated_length": 1055.1711303710938, "completions/min_length": 0.0, "completions/min_terminated_length": 251.6, "epoch": 1.9575880301496231, "grad_norm": 0.0006331638433039188, "learning_rate": 1.3521634615384617e-06, "loss": -0.0036, "num_tokens": 2132739451.0, "reward": 1.2915933847427368, "reward_std": 0.11312725692987442, "rewards/accuracy_reward": 0.7342014074325561, "rewards/brier_reward": 0.8527894616127014, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9961805582046509, "rewards/mean_confidence_reward": 0.7735051870346069, "signal/accuracy_reward/centered_abs_mean": 0.10846354067325592, "signal/accuracy_reward/group_std_mean": 0.14508081674575807, "signal/accuracy_reward/group_zero_std_frac": 0.5805555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05423177033662796, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05423177033662796, "signal/advantage_abs_mean": 0.0818033717572689, "signal/advantage_pre_scale_abs_mean": 0.0818033717572689, "signal/advantage_pre_scale_std": 0.1729103296995163, "signal/advantage_std": 0.1729103296995163, "signal/brier_reward/centered_abs_mean": 0.06907694563269615, "signal/brier_reward/group_std_mean": 0.09413802623748779, "signal/brier_reward/group_zero_std_frac": 0.2611111104488373, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03453847281634807, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03453847281634807, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.0070095486007630825, "signal/format_reward/group_std_mean": 0.015860436484217644, "signal/format_reward/group_zero_std_frac": 0.925000011920929, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0035047743003815413, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0035047743003815413, "signal/mean_confidence_reward/centered_abs_mean": 0.04945590049028396, "signal/mean_confidence_reward/group_std_mean": 0.06690601259469986, "signal/mean_confidence_reward/group_zero_std_frac": 0.275, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.945589807903161e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.945589807903161e-07, "step": 815 }, { "calibration/aurc": 0.11589190977641957, "calibration/batch_distribution_entropy": 0.452526403917706, "calibration/confidence_entropy": 0.41250519697135885, "calibration/coverage@0%": 0.021885907504363, "calibration/coverage@1%": 0.021885907504363, "calibration/coverage@10%": 0.4390796017137598, "calibration/coverage@15%": 0.7147800457602559, "calibration/coverage@20%": 0.7774584389908821, "calibration/coverage@25%": 0.8140461387434555, "calibration/coverage@30%": 0.9890625, "calibration/coverage@5%": 0.31687936300174524, "calibration/ece": 0.09831729211094095, "calibration/mean_confidence": 0.8050522495802179, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006163194444444442, "completions/max_length": 4039.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 1072.0278930664062, "completions/mean_terminated_length": 1078.587060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 234.4, "epoch": 1.969587880151498, "grad_norm": 0.00054346484830603, "learning_rate": 1.3221153846153848e-06, "loss": -0.0076, "num_tokens": 2148172988.0, "reward": 1.3095946788787842, "reward_std": 0.10846559852361679, "rewards/accuracy_reward": 0.7594618201255798, "rewards/brier_reward": 0.8658751845359802, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9938368082046509, "rewards/mean_confidence_reward": 0.7777612686157227, "signal/accuracy_reward/centered_abs_mean": 0.10074327290058135, "signal/accuracy_reward/group_std_mean": 0.1359761267900467, "signal/accuracy_reward/group_zero_std_frac": 0.5972222208976745, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05037163645029068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05037163645029068, "signal/advantage_abs_mean": 0.07821892276406288, "signal/advantage_pre_scale_abs_mean": 0.07821892276406288, "signal/advantage_pre_scale_std": 0.1690058708190918, "signal/advantage_std": 0.1690058708190918, "signal/brier_reward/centered_abs_mean": 0.0682062529027462, "signal/brier_reward/group_std_mean": 0.09166969656944275, "signal/brier_reward/group_zero_std_frac": 0.28888889253139494, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0341031264513731, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0341031264513731, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.00993381068110466, "signal/format_reward/group_std_mean": 0.018109358102083205, "signal/format_reward/group_zero_std_frac": 0.9305555820465088, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00496690534055233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00496690534055233, "signal/mean_confidence_reward/centered_abs_mean": 0.04811898916959763, "signal/mean_confidence_reward/group_std_mean": 0.06453772634267807, "signal/mean_confidence_reward/group_zero_std_frac": 0.3027777761220932, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.811898975276562e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.811898975276562e-07, "step": 820 }, { "calibration/aurc": 0.1034640561628074, "calibration/batch_distribution_entropy": 0.48837533014045215, "calibration/confidence_entropy": 0.43127064790039754, "calibration/coverage@0%": 0.19106245991691, "calibration/coverage@1%": 0.31820599969211616, "calibration/coverage@10%": 0.5777059676424134, "calibration/coverage@15%": 0.7068726343090801, "calibration/coverage@20%": 0.7772169502617801, "calibration/coverage@25%": 0.8658731457242583, "calibration/coverage@30%": 0.907757962478185, "calibration/coverage@5%": 0.4713403596080436, "calibration/ece": 0.1382662549603276, "calibration/mean_confidence": 0.7890722410967546, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0043402777777777676, "completions/max_length": 4027.4, "completions/max_terminated_length": 4027.4, "completions/mean_length": 1003.509033203125, "completions/mean_terminated_length": 1007.8230712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 227.0, "epoch": 1.981587730153373, "grad_norm": 0.0005832580500282347, "learning_rate": 1.292067307692308e-06, "loss": -0.0046, "num_tokens": 2162821796.0, "reward": 1.3140793561935424, "reward_std": 0.10351951867341995, "rewards/accuracy_reward": 0.7637152910232544, "rewards/brier_reward": 0.8687680125236511, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9956597089767456, "rewards/mean_confidence_reward": 0.7881805539131165, "signal/accuracy_reward/centered_abs_mean": 0.10021701455116272, "signal/accuracy_reward/group_std_mean": 0.13445703089237213, "signal/accuracy_reward/group_zero_std_frac": 0.6, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05010850727558136, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05010850727558136, "signal/advantage_abs_mean": 0.07497808933258057, "signal/advantage_pre_scale_abs_mean": 0.07497808933258057, "signal/advantage_pre_scale_std": 0.16351228654384614, "signal/advantage_std": 0.16351228654384614, "signal/brier_reward/centered_abs_mean": 0.06002784073352814, "signal/brier_reward/group_std_mean": 0.08266732096672058, "signal/brier_reward/group_zero_std_frac": 0.272222226858139, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03001392036676407, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03001392036676407, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.007378471991978585, "signal/format_reward/group_std_mean": 0.013285787403583526, "signal/format_reward/group_zero_std_frac": 0.9472222208976746, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0036892359959892927, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0036892359959892927, "signal/mean_confidence_reward/centered_abs_mean": 0.04876487404108047, "signal/mean_confidence_reward/group_std_mean": 0.06459259018301963, "signal/mean_confidence_reward/group_zero_std_frac": 0.2861111104488373, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.876487366800575e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.876487366800575e-07, "step": 825 }, { "calibration/aurc": 0.08862981043732664, "calibration/batch_distribution_entropy": 0.5230403476270299, "calibration/confidence_entropy": 0.4400446699992747, "calibration/coverage@0%": 0.20470109878154918, "calibration/coverage@1%": 0.2411594321148825, "calibration/coverage@10%": 0.6226930272283207, "calibration/coverage@15%": 0.7415657366524042, "calibration/coverage@20%": 0.7488574033190709, "calibration/coverage@25%": 0.9431054177545691, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.607588860561654, "calibration/ece": 0.1083303284297977, "calibration/mean_confidence": 0.7779841447462853, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005121527777777746, "completions/max_length": 4021.4, "completions/max_terminated_length": 4021.4, "completions/mean_length": 1078.4727416992187, "completions/mean_terminated_length": 1084.0625, "completions/min_length": 0.0, "completions/min_terminated_length": 228.8, "epoch": 1.993587580155248, "grad_norm": 0.000507715973071754, "learning_rate": 1.2620192307692309e-06, "loss": -0.0056, "num_tokens": 2178369866.0, "reward": 1.3042982578277589, "reward_std": 0.10334762632846832, "rewards/accuracy_reward": 0.7440972328186035, "rewards/brier_reward": 0.8696058988571167, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9948784708976746, "rewards/mean_confidence_reward": 0.7456249952316284, "signal/accuracy_reward/centered_abs_mean": 0.09817708283662796, "signal/accuracy_reward/group_std_mean": 0.13541424423456191, "signal/accuracy_reward/group_zero_std_frac": 0.5916666626930237, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04908854141831398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04908854141831398, "signal/advantage_abs_mean": 0.07246098965406418, "signal/advantage_pre_scale_abs_mean": 0.07246098965406418, "signal/advantage_pre_scale_std": 0.15726800858974457, "signal/advantage_std": 0.15726800858974457, "signal/brier_reward/centered_abs_mean": 0.06244741380214691, "signal/brier_reward/group_std_mean": 0.08613227009773254, "signal/brier_reward/group_zero_std_frac": 0.2194444477558136, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.031223706901073456, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.031223706901073456, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.008610026258975268, "signal/format_reward/group_std_mean": 0.017031885124742983, "signal/format_reward/group_zero_std_frac": 0.925000011920929, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004305013129487634, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004305013129487634, "signal/mean_confidence_reward/centered_abs_mean": 0.05296441167593002, "signal/mean_confidence_reward/group_std_mean": 0.06911907643079758, "signal/mean_confidence_reward/group_zero_std_frac": 0.2250000059604645, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.296440804158919e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.296440804158919e-07, "step": 830 }, { "calibration/aurc": 0.09766838706953529, "calibration/batch_distribution_entropy": 0.5944667525426071, "calibration/confidence_entropy": 0.45899022606352624, "calibration/coverage@0%": 0.24882778503046127, "calibration/coverage@1%": 0.30731342471714534, "calibration/coverage@10%": 0.5458265339425588, "calibration/coverage@15%": 0.708419005657093, "calibration/coverage@20%": 0.8341261422976501, "calibration/coverage@25%": 0.9274151436031332, "calibration/coverage@30%": 0.9822454308093995, "calibration/coverage@5%": 0.45143875108790255, "calibration/ece": 0.13212630548302862, "calibration/mean_confidence": 0.7457698270234987, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004600694444444442, "completions/max_length": 3567.2, "completions/max_terminated_length": 3567.2, "completions/mean_length": 1093.4208740234376, "completions/mean_terminated_length": 1098.7288452148437, "completions/min_length": 81.6, "completions/min_terminated_length": 245.2, "epoch": 2.007199910001125, "grad_norm": 0.0006160699995234609, "learning_rate": 1.231971153846154e-06, "loss": -0.0058, "num_tokens": 2194258712.0, "reward": 1.2970847368240357, "reward_std": 0.1047350749373436, "rewards/accuracy_reward": 0.7355902791023254, "rewards/brier_reward": 0.8637727975845337, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916626930237, "rewards/mean_confidence_reward": 0.7374088287353515, "signal/accuracy_reward/centered_abs_mean": 0.09618055522441864, "signal/accuracy_reward/group_std_mean": 0.1332211673259735, "signal/accuracy_reward/group_zero_std_frac": 0.5972222208976745, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04809027761220932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04809027761220932, "signal/advantage_abs_mean": 0.07343522906303405, "signal/advantage_pre_scale_abs_mean": 0.07343522906303405, "signal/advantage_pre_scale_std": 0.16196149289608003, "signal/advantage_std": 0.16196149289608003, "signal/brier_reward/centered_abs_mean": 0.06278966441750526, "signal/brier_reward/group_std_mean": 0.08675736784934998, "signal/brier_reward/group_zero_std_frac": 0.18888888955116273, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03139483220875263, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03139483220875263, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.009320746641606092, "signal/format_reward/group_std_mean": 0.018390318378806114, "signal/format_reward/group_zero_std_frac": 0.9222222208976746, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004660373320803046, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004660373320803046, "signal/mean_confidence_reward/centered_abs_mean": 0.0540662981569767, "signal/mean_confidence_reward/group_std_mean": 0.07149812430143357, "signal/mean_confidence_reward/group_zero_std_frac": 0.19722222685813903, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.406629611570679e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.406629611570679e-07, "step": 835 }, { "calibration/aurc": 0.09978157354314617, "calibration/batch_distribution_entropy": 0.5650794941810595, "calibration/confidence_entropy": 0.4494567287824891, "calibration/coverage@0%": 0.22224924763985654, "calibration/coverage@1%": 0.3691662887688777, "calibration/coverage@10%": 0.45889571395198636, "calibration/coverage@15%": 0.6188909290273138, "calibration/coverage@20%": 0.8345386458375816, "calibration/coverage@25%": 0.9837270341207349, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.41850737244231906, "calibration/ece": 0.10342442245005017, "calibration/mean_confidence": 0.7250308993842118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00894097222222221, "completions/max_length": 4028.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 1127.3750244140624, "completions/mean_terminated_length": 1137.641796875, "completions/min_length": 0.0, "completions/min_terminated_length": 227.0, "epoch": 2.019199760003, "grad_norm": 0.0005550920031964779, "learning_rate": 1.201923076923077e-06, "loss": -0.0098, "num_tokens": 2210343672.0, "reward": 1.301007866859436, "reward_std": 0.11482644528150558, "rewards/accuracy_reward": 0.7499131917953491, "rewards/brier_reward": 0.8611158728599548, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9909722089767456, "rewards/mean_confidence_reward": 0.722387146949768, "signal/accuracy_reward/centered_abs_mean": 0.11458875685930252, "signal/accuracy_reward/group_std_mean": 0.1525307595729828, "signal/accuracy_reward/group_zero_std_frac": 0.5611111164093018, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05729437842965126, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05729437842965126, "signal/advantage_abs_mean": 0.08159027993679047, "signal/advantage_pre_scale_abs_mean": 0.08159027993679047, "signal/advantage_pre_scale_std": 0.17119903564453126, "signal/advantage_std": 0.17119903564453126, "signal/brier_reward/centered_abs_mean": 0.06552227512001992, "signal/brier_reward/group_std_mean": 0.0908414974808693, "signal/brier_reward/group_zero_std_frac": 0.19722222089767455, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03276113756000996, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03276113756000996, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.01477864570915699, "signal/format_reward/group_std_mean": 0.02585829570889473, "signal/format_reward/group_zero_std_frac": 0.9000000119209289, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007389322854578495, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007389322854578495, "signal/mean_confidence_reward/centered_abs_mean": 0.05664767697453499, "signal/mean_confidence_reward/group_std_mean": 0.0751990869641304, "signal/mean_confidence_reward/group_zero_std_frac": 0.20277777910232545, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.664767627422407e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.664767627422407e-07, "step": 840 }, { "calibration/aurc": 0.07807626745245026, "calibration/batch_distribution_entropy": 0.5423396008770174, "calibration/confidence_entropy": 0.4413449743933192, "calibration/coverage@0%": 0.1469038375411407, "calibration/coverage@1%": 0.31562076875807193, "calibration/coverage@10%": 0.6645313216056327, "calibration/coverage@15%": 0.6981729497354497, "calibration/coverage@20%": 0.8935763888888889, "calibration/coverage@25%": 0.9495701058201058, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.6112575120818231, "calibration/ece": 0.11914226867474897, "calibration/mean_confidence": 0.7790576373265844, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00512152777777779, "completions/max_length": 4064.4, "completions/max_terminated_length": 4064.4, "completions/mean_length": 1101.3343994140625, "completions/mean_terminated_length": 1107.0077392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 229.8, "epoch": 2.031199610004875, "grad_norm": 0.0006690862937830389, "learning_rate": 1.1718750000000001e-06, "loss": -0.0053, "num_tokens": 2226130596.0, "reward": 1.3089288473129272, "reward_std": 0.11670268476009368, "rewards/accuracy_reward": 0.7503472208976746, "rewards/brier_reward": 0.8725304007530212, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9949652910232544, "rewards/mean_confidence_reward": 0.7389062404632568, "signal/accuracy_reward/centered_abs_mean": 0.11129557490348815, "signal/accuracy_reward/group_std_mean": 0.15475783199071885, "signal/accuracy_reward/group_zero_std_frac": 0.5277777850627899, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05564778745174408, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05564778745174408, "signal/advantage_abs_mean": 0.08028728440403939, "signal/advantage_pre_scale_abs_mean": 0.08028728440403939, "signal/advantage_pre_scale_std": 0.1678263634443283, "signal/advantage_std": 0.1678263634443283, "signal/brier_reward/centered_abs_mean": 0.06271324902772904, "signal/brier_reward/group_std_mean": 0.08837952762842179, "signal/brier_reward/group_zero_std_frac": 0.1944444477558136, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03135662451386452, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03135662451386452, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.009288194496184588, "signal/format_reward/group_std_mean": 0.020809750258922576, "signal/format_reward/group_zero_std_frac": 0.9027777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004644097248092294, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004644097248092294, "signal/mean_confidence_reward/centered_abs_mean": 0.055296768248081204, "signal/mean_confidence_reward/group_std_mean": 0.07497987896203995, "signal/mean_confidence_reward/group_zero_std_frac": 0.205555559694767, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.52967657085901e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.52967657085901e-07, "step": 845 }, { "calibration/aurc": 0.14738662860723134, "calibration/batch_distribution_entropy": 0.5912355183803676, "calibration/confidence_entropy": 0.4613363734694335, "calibration/coverage@0%": 0.015143624492502016, "calibration/coverage@1%": 0.015143624492502016, "calibration/coverage@10%": 0.43637599306704206, "calibration/coverage@15%": 0.6349338102447382, "calibration/coverage@20%": 0.7480893498329984, "calibration/coverage@25%": 0.834199554358673, "calibration/coverage@30%": 0.8676732328134185, "calibration/coverage@5%": 0.25441064019930837, "calibration/ece": 0.1092543684839764, "calibration/mean_confidence": 0.7126866874194041, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006684027777777745, "completions/max_length": 4023.2, "completions/max_terminated_length": 4023.2, "completions/mean_length": 1137.7923461914063, "completions/mean_terminated_length": 1145.6076293945312, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 2.04319946000675, "grad_norm": 0.0005026277503930032, "learning_rate": 1.141826923076923e-06, "loss": -0.0091, "num_tokens": 2242353452.0, "reward": 1.3009058237075806, "reward_std": 0.12361187785863877, "rewards/accuracy_reward": 0.7506944417953492, "rewards/brier_reward": 0.8577864408493042, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9933159708976745, "rewards/mean_confidence_reward": 0.7437222242355347, "signal/accuracy_reward/centered_abs_mean": 0.12034505009651184, "signal/accuracy_reward/group_std_mean": 0.16336107552051543, "signal/accuracy_reward/group_zero_std_frac": 0.5138889014720917, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.06017252504825592, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06017252504825592, "signal/advantage_abs_mean": 0.0865494504570961, "signal/advantage_pre_scale_abs_mean": 0.0865494504570961, "signal/advantage_pre_scale_std": 0.1790706753730774, "signal/advantage_std": 0.1790706753730774, "signal/brier_reward/centered_abs_mean": 0.07205562740564346, "signal/brier_reward/group_std_mean": 0.0994638592004776, "signal/brier_reward/group_zero_std_frac": 0.22500000298023223, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03602781370282173, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03602781370282173, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.012169053591787816, "signal/format_reward/group_std_mean": 0.025033322721719743, "signal/format_reward/group_zero_std_frac": 0.8916666746139527, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006084526795893908, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006084526795893908, "signal/mean_confidence_reward/centered_abs_mean": 0.05648736655712128, "signal/mean_confidence_reward/group_std_mean": 0.07585688680410385, "signal/mean_confidence_reward/group_zero_std_frac": 0.24166666567325593, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.648736419061606e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.648736419061606e-07, "step": 850 }, { "epoch": 2.04319946000675, "eval_calibration/aurc": 0.08159028258611821, "eval_calibration/batch_distribution_entropy": 0.491488108718646, "eval_calibration/confidence_entropy": 0.41748531893969226, "eval_calibration/coverage@0%": 0.4729502688172043, "eval_calibration/coverage@1%": 0.4729502688172043, "eval_calibration/coverage@10%": 0.5668682795698925, "eval_calibration/coverage@15%": 0.8541666666666666, "eval_calibration/coverage@20%": 0.8958333333333334, "eval_calibration/coverage@25%": 0.9427083333333334, "eval_calibration/coverage@30%": 0.96875, "eval_calibration/coverage@5%": 0.4991599462365592, "eval_calibration/ece": 0.150739247311828, "eval_calibration/mean_confidence": 0.7854502688172044, "eval_completions/clipped_ratio": 0.00434027777777779, "eval_completions/max_length": 3374.0, "eval_completions/max_terminated_length": 3374.0, "eval_completions/mean_length": 1043.7161661783855, "eval_completions/mean_terminated_length": 1048.3323364257812, "eval_completions/min_length": 106.66666666666667, "eval_completions/min_terminated_length": 269.5, "eval_loss": 0.0, "eval_num_tokens": 2242353452.0, "eval_reward": 1.2891850868860881, "eval_reward_std": 0.3245917409658432, "eval_rewards/accuracy_reward": 0.7282986144224802, "eval_rewards/brier_reward": 0.8552644352118174, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9947916766007742, "eval_rewards/mean_confidence_reward": 0.7748958071072897, "eval_runtime": 209.3546, "eval_samples_per_second": 4.777, "eval_signal/accuracy_reward/centered_abs_mean": 0.3843858440717061, "eval_signal/accuracy_reward/group_std_mean": 0.44343380133310956, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19219292203585306, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.19219292203585306, "eval_signal/advantage_abs_mean": 0.2713930557171504, "eval_signal/advantage_pre_scale_abs_mean": 0.2713930557171504, "eval_signal/advantage_pre_scale_std": 0.3232206255197525, "eval_signal/advantage_std": 0.3232206255197525, "eval_signal/brier_reward/centered_abs_mean": 0.17644339551528296, "eval_signal/brier_reward/group_std_mean": 0.23666371405124664, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08822169775764148, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.08822169775764148, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.010091145522892475, "eval_signal/format_reward/group_std_mean": 0.02946278266608715, "eval_signal/format_reward/group_zero_std_frac": 0.8333333631356558, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0050455727614462376, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0050455727614462376, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.1646505817770958, "eval_signal/mean_confidence_reward/group_std_mean": 0.21078165620565414, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.6465057607698934e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.6465057607698934e-06, "eval_steps_per_second": 0.029, "step": 850 }, { "epoch": 2.04319946000675, "step": 850, "train_probe_calibration/aurc": 0.10552258862618018, "train_probe_calibration/batch_distribution_entropy": 0.47937084389474033, "train_probe_calibration/confidence_entropy": 0.4174242075540912, "train_probe_calibration/coverage@0%": 0.23991935483870966, "train_probe_calibration/coverage@1%": 0.23991935483870966, "train_probe_calibration/coverage@10%": 0.3597110215053763, "train_probe_calibration/coverage@15%": 0.8160282258064516, "train_probe_calibration/coverage@20%": 0.9465725806451614, "train_probe_calibration/coverage@25%": 0.9623655913978495, "train_probe_calibration/coverage@30%": 0.9946236559139785, "train_probe_calibration/coverage@5%": 0.2555443548387097, "train_probe_calibration/ece": 0.1207829301075269, "train_probe_calibration/mean_confidence": 0.780040322580645, "train_probe_completions/clipped_ratio": 0.008506944444444461, "train_probe_completions/max_length": 3671.8333333333335, "train_probe_completions/max_terminated_length": 3671.8333333333335, "train_probe_completions/mean_length": 1062.6993306477864, "train_probe_completions/mean_terminated_length": 1072.2170003255208, "train_probe_completions/min_length": 41.333333333333336, "train_probe_completions/min_terminated_length": 230.33333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 2242353452.0, "train_probe_reward": 1.3152791261672974, "train_probe_reward_std": 0.31444039940834045, "train_probe_rewards/accuracy_reward": 0.7725694477558136, "train_probe_rewards/brier_reward": 0.8649175862471262, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.9930555621782938, "train_probe_rewards/mean_confidence_reward": 0.7848958273728689, "train_probe_runtime": 209.926, "train_probe_samples_per_second": 4.764, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3421223958333333, "train_probe_signal/accuracy_reward/group_std_mean": 0.4178264339764913, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.17106119791666666, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.17106119791666666, "train_probe_signal/advantage_abs_mean": 0.2479413946469625, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2479413946469625, "train_probe_signal/advantage_pre_scale_std": 0.3138415813446045, "train_probe_signal/advantage_std": 0.3138415813446045, "train_probe_signal/brier_reward/centered_abs_mean": 0.16967979073524475, "train_probe_signal/brier_reward/group_std_mean": 0.23335971186558405, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08483989536762238, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.08483989536762238, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.013346354011446238, "train_probe_signal/format_reward/group_std_mean": 0.0362943010404706, "train_probe_signal/format_reward/group_zero_std_frac": 0.8055555820465088, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.006673177005723119, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.006673177005723119, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.15330943713585535, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.20380775878826776, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.5330942725692391e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.5330942725692391e-06, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.1597541365795626, "calibration/batch_distribution_entropy": 0.4829246260094659, "calibration/confidence_entropy": 0.4350086632908563, "calibration/coverage@0%": 0.009895833333333333, "calibration/coverage@1%": 0.009895833333333333, "calibration/coverage@10%": 0.6022633071553228, "calibration/coverage@15%": 0.6596640488656196, "calibration/coverage@20%": 0.6763907068062827, "calibration/coverage@25%": 0.7118073734729494, "calibration/coverage@30%": 0.7802083333333333, "calibration/coverage@5%": 0.20416666666666666, "calibration/ece": 0.10572746505304301, "calibration/mean_confidence": 0.7699966751436013, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003819444444444442, "completions/max_length": 3940.4, "completions/max_terminated_length": 3940.4, "completions/mean_length": 1043.3607543945313, "completions/mean_terminated_length": 1047.4768676757812, "completions/min_length": 0.0, "completions/min_terminated_length": 209.2, "epoch": 2.055199310008625, "grad_norm": 0.0005764501984231174, "learning_rate": 1.1117788461538462e-06, "loss": -0.0026, "num_tokens": 2257464648.0, "reward": 1.29567711353302, "reward_std": 0.1193440854549408, "rewards/accuracy_reward": 0.7410590171813964, "rewards/brier_reward": 0.85409916639328, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9961805462837219, "rewards/mean_confidence_reward": 0.7752447605133057, "signal/accuracy_reward/centered_abs_mean": 0.117431640625, "signal/accuracy_reward/group_std_mean": 0.15578319728374482, "signal/accuracy_reward/group_zero_std_frac": 0.5555555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0587158203125, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0587158203125, "signal/advantage_abs_mean": 0.08634682521224021, "signal/advantage_pre_scale_abs_mean": 0.08634682521224021, "signal/advantage_pre_scale_std": 0.17731020152568816, "signal/advantage_std": 0.17731020152568816, "signal/brier_reward/centered_abs_mean": 0.07291394025087357, "signal/brier_reward/group_std_mean": 0.09953448325395584, "signal/brier_reward/group_zero_std_frac": 0.23611111640930177, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036456970125436784, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036456970125436784, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.007052951352670788, "signal/format_reward/group_std_mean": 0.015664545819163322, "signal/format_reward/group_zero_std_frac": 0.9277777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.003526475676335394, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.003526475676335394, "signal/mean_confidence_reward/centered_abs_mean": 0.05578890517354011, "signal/mean_confidence_reward/group_std_mean": 0.07398995310068131, "signal/mean_confidence_reward/group_zero_std_frac": 0.26388889253139497, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.578890466040321e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.578890466040321e-07, "step": 855 }, { "calibration/aurc": 0.09355785889361831, "calibration/batch_distribution_entropy": 0.44840018734256165, "calibration/confidence_entropy": 0.409109052843769, "calibration/coverage@0%": 0.025124889743375713, "calibration/coverage@1%": 0.025124889743375713, "calibration/coverage@10%": 0.49056292096386345, "calibration/coverage@15%": 0.8404279285543377, "calibration/coverage@20%": 0.9039037472455235, "calibration/coverage@25%": 0.964490861618799, "calibration/coverage@30%": 0.9926892950391645, "calibration/coverage@5%": 0.3255165346519919, "calibration/ece": 0.09662744300891082, "calibration/mean_confidence": 0.7953523675949796, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003819444444444442, "completions/max_length": 4010.4, "completions/max_terminated_length": 4010.4, "completions/mean_length": 1088.1883056640625, "completions/mean_terminated_length": 1092.5062744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 2.0671991600105, "grad_norm": 0.000613938900642097, "learning_rate": 1.0817307692307693e-06, "loss": -0.0031, "num_tokens": 2273067393.0, "reward": 1.3072936058044433, "reward_std": 0.10972489565610885, "rewards/accuracy_reward": 0.7514756798744202, "rewards/brier_reward": 0.8669155478477478, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9961805701255798, "rewards/mean_confidence_reward": 0.7736467123031616, "signal/accuracy_reward/centered_abs_mean": 0.09902886301279068, "signal/accuracy_reward/group_std_mean": 0.1390759453177452, "signal/accuracy_reward/group_zero_std_frac": 0.575000011920929, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04951443150639534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04951443150639534, "signal/advantage_abs_mean": 0.07747430875897407, "signal/advantage_pre_scale_abs_mean": 0.07747430875897407, "signal/advantage_pre_scale_std": 0.16334012746810914, "signal/advantage_std": 0.16334012746810914, "signal/brier_reward/centered_abs_mean": 0.07115790098905564, "signal/brier_reward/group_std_mean": 0.0971034288406372, "signal/brier_reward/group_zero_std_frac": 0.22222222983837128, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03557895049452782, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03557895049452782, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.006434461602475494, "signal/format_reward/group_std_mean": 0.013291321508586407, "signal/format_reward/group_zero_std_frac": 0.9388889074325562, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.003217230801237747, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.003217230801237747, "signal/mean_confidence_reward/centered_abs_mean": 0.055993981659412384, "signal/mean_confidence_reward/group_std_mean": 0.07472211122512817, "signal/mean_confidence_reward/group_zero_std_frac": 0.24444444477558136, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.599398150479829e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.599398150479829e-07, "step": 860 }, { "calibration/aurc": 0.16649382844283, "calibration/batch_distribution_entropy": 0.41514272329618135, "calibration/confidence_entropy": 0.3999180361622753, "calibration/coverage@0%": 0.011481523023347134, "calibration/coverage@1%": 0.14150763268392155, "calibration/coverage@10%": 0.30944304055264304, "calibration/coverage@15%": 0.4766632637420173, "calibration/coverage@20%": 0.5940226107191983, "calibration/coverage@25%": 0.7648862955282901, "calibration/coverage@30%": 0.953123640121845, "calibration/coverage@5%": 0.16343974756642807, "calibration/ece": 0.129632055686544, "calibration/mean_confidence": 0.801827122924087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00269097222222221, "completions/max_length": 3933.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 1017.4327270507813, "completions/mean_terminated_length": 1020.2711303710937, "completions/min_length": 42.4, "completions/min_terminated_length": 233.8, "epoch": 2.079199010012375, "grad_norm": 0.0005889848689548671, "learning_rate": 1.0516826923076925e-06, "loss": -0.0017, "num_tokens": 2287900314.0, "reward": 1.3201057195663453, "reward_std": 0.10836056768894195, "rewards/accuracy_reward": 0.7749131798744202, "rewards/brier_reward": 0.8679733037948608, "rewards/confidence_one_or_zero": 0.0006944444554392249, "rewards/format_reward": 0.9973090291023254, "rewards/mean_confidence_reward": 0.7935234189033509, "signal/accuracy_reward/centered_abs_mean": 0.10527886152267456, "signal/accuracy_reward/group_std_mean": 0.14146318435668945, "signal/accuracy_reward/group_zero_std_frac": 0.5805555701255798, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05263943076133728, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05263943076133728, "signal/advantage_abs_mean": 0.07752762287855149, "signal/advantage_pre_scale_abs_mean": 0.07752762287855149, "signal/advantage_pre_scale_std": 0.16775264739990234, "signal/advantage_std": 0.16775264739990234, "signal/brier_reward/centered_abs_mean": 0.06674118638038636, "signal/brier_reward/group_std_mean": 0.09235648214817047, "signal/brier_reward/group_zero_std_frac": 0.25000000596046446, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03337059319019318, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03337059319019318, "signal/confidence_one_or_zero/centered_abs_mean": 0.0011176215368323029, "signal/confidence_one_or_zero/group_std_mean": 0.0016577502712607384, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.117621479806985e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.117621479806985e-08, "signal/format_reward/centered_abs_mean": 0.0049858940532431, "signal/format_reward/group_std_mean": 0.010996199026703835, "signal/format_reward/group_zero_std_frac": 0.95, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00249294702662155, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00249294702662155, "signal/mean_confidence_reward/centered_abs_mean": 0.04814069420099258, "signal/mean_confidence_reward/group_std_mean": 0.06465215906500817, "signal/mean_confidence_reward/group_zero_std_frac": 0.27777778804302217, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.814069370695506e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.814069370695506e-07, "step": 865 }, { "calibration/aurc": 0.0803849323735587, "calibration/batch_distribution_entropy": 0.40801975891128184, "calibration/confidence_entropy": 0.38967398830057814, "calibration/coverage@0%": 0.05071618072446441, "calibration/coverage@1%": 0.3345293368924434, "calibration/coverage@10%": 0.688358171524371, "calibration/coverage@15%": 0.759487660778332, "calibration/coverage@20%": 0.9431309991273997, "calibration/coverage@25%": 0.9822916666666668, "calibration/coverage@30%": 0.9953125, "calibration/coverage@5%": 0.38095732245674785, "calibration/ece": 0.0790981114987426, "calibration/mean_confidence": 0.8139067363286887, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003211805555555558, "completions/max_length": 3998.0, "completions/max_terminated_length": 3998.0, "completions/mean_length": 952.830126953125, "completions/mean_terminated_length": 955.8805541992188, "completions/min_length": 0.0, "completions/min_terminated_length": 215.6, "epoch": 2.09119886001425, "grad_norm": 0.0005842600367031991, "learning_rate": 1.0216346153846154e-06, "loss": -0.0024, "num_tokens": 2301930421.0, "reward": 1.327964997291565, "reward_std": 0.10693805068731307, "rewards/accuracy_reward": 0.7835937380790711, "rewards/brier_reward": 0.8755320549011231, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967882037162781, "rewards/mean_confidence_reward": 0.8038428664207459, "signal/accuracy_reward/centered_abs_mean": 0.09854058176279068, "signal/accuracy_reward/group_std_mean": 0.13622574806213378, "signal/accuracy_reward/group_zero_std_frac": 0.5944444537162781, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04927029088139534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04927029088139534, "signal/advantage_abs_mean": 0.07595544531941414, "signal/advantage_pre_scale_abs_mean": 0.07595544531941414, "signal/advantage_pre_scale_std": 0.16747937798500062, "signal/advantage_std": 0.16747937798500062, "signal/brier_reward/centered_abs_mean": 0.0652807392179966, "signal/brier_reward/group_std_mean": 0.09026803225278854, "signal/brier_reward/group_zero_std_frac": 0.21388889253139495, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0326403696089983, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0326403696089983, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.00553927943110466, "signal/format_reward/group_std_mean": 0.01055721715092659, "signal/format_reward/group_zero_std_frac": 0.9555555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00276963971555233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00276963971555233, "signal/mean_confidence_reward/centered_abs_mean": 0.04773189425468445, "signal/mean_confidence_reward/group_std_mean": 0.06455591842532157, "signal/mean_confidence_reward/group_zero_std_frac": 0.2472222238779068, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.773189289153379e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.773189289153379e-07, "step": 870 }, { "calibration/aurc": 0.12063256173446094, "calibration/batch_distribution_entropy": 0.41709659852608383, "calibration/confidence_entropy": 0.39141614254982426, "calibration/coverage@0%": 0.058928959965187114, "calibration/coverage@1%": 0.058928959965187114, "calibration/coverage@10%": 0.36388979547432554, "calibration/coverage@15%": 0.6742683855526544, "calibration/coverage@20%": 0.909260770234987, "calibration/coverage@25%": 0.985378590078329, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.20723183202785028, "calibration/ece": 0.07688216655787637, "calibration/mean_confidence": 0.8074188560704962, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0014756944444444641, "completions/max_length": 3899.8, "completions/max_terminated_length": 3899.8, "completions/mean_length": 1032.3447387695312, "completions/mean_terminated_length": 1033.8386474609374, "completions/min_length": 54.2, "completions/min_terminated_length": 248.6, "epoch": 2.103198710016125, "grad_norm": 0.0005836205673404038, "learning_rate": 9.915865384615386e-07, "loss": 0.0024, "num_tokens": 2316928248.0, "reward": 1.3029038429260253, "reward_std": 0.1198845311999321, "rewards/accuracy_reward": 0.7480034589767456, "rewards/brier_reward": 0.8592642426490784, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9985243082046509, "rewards/mean_confidence_reward": 0.7874478936195374, "signal/accuracy_reward/centered_abs_mean": 0.113623046875, "signal/accuracy_reward/group_std_mean": 0.15551528632640838, "signal/accuracy_reward/group_zero_std_frac": 0.5361111104488373, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0568115234375, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0568115234375, "signal/advantage_abs_mean": 0.08550895154476165, "signal/advantage_pre_scale_abs_mean": 0.08550895154476165, "signal/advantage_pre_scale_std": 0.17388284504413604, "signal/advantage_std": 0.17388284504413604, "signal/brier_reward/centered_abs_mean": 0.07246861308813095, "signal/brier_reward/group_std_mean": 0.1001792460680008, "signal/brier_reward/group_zero_std_frac": 0.14722222238779067, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036234306544065474, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036234306544065474, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.0028374565299600363, "signal/format_reward/group_std_mean": 0.007749906368553638, "signal/format_reward/group_zero_std_frac": 0.9583333253860473, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0014187282649800181, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0014187282649800181, "signal/mean_confidence_reward/centered_abs_mean": 0.056150124222040174, "signal/mean_confidence_reward/group_std_mean": 0.07590972036123275, "signal/mean_confidence_reward/group_zero_std_frac": 0.18055555820465088, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.615012355519866e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.615012355519866e-07, "step": 875 }, { "calibration/aurc": 0.12072107709448598, "calibration/batch_distribution_entropy": 0.47031912651986535, "calibration/confidence_entropy": 0.4150360970445607, "calibration/coverage@0%": 0.037510964912280696, "calibration/coverage@1%": 0.037510964912280696, "calibration/coverage@10%": 0.37396929824561403, "calibration/coverage@15%": 0.7962656028125144, "calibration/coverage@20%": 0.833634023865146, "calibration/coverage@25%": 0.8984320175438597, "calibration/coverage@30%": 0.9356414473684211, "calibration/coverage@5%": 0.345844298245614, "calibration/ece": 0.09727097146832493, "calibration/mean_confidence": 0.772867034692639, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3925.8, "completions/max_terminated_length": 3925.8, "completions/mean_length": 1015.885595703125, "completions/mean_terminated_length": 1019.98447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 210.2, "epoch": 2.115198560018, "grad_norm": 0.0004951026639901102, "learning_rate": 9.615384615384617e-07, "loss": -0.0033, "num_tokens": 2331715794.0, "reward": 1.318756365776062, "reward_std": 0.10814566612243652, "rewards/accuracy_reward": 0.7736979126930237, "rewards/brier_reward": 0.8677921056747436, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9960069417953491, "rewards/mean_confidence_reward": 0.7930642366409302, "signal/accuracy_reward/centered_abs_mean": 0.0947970911860466, "signal/accuracy_reward/group_std_mean": 0.13703084737062454, "signal/accuracy_reward/group_zero_std_frac": 0.5638888895511627, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0473985455930233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0473985455930233, "signal/advantage_abs_mean": 0.07385501414537429, "signal/advantage_pre_scale_abs_mean": 0.07385501414537429, "signal/advantage_pre_scale_std": 0.16370636224746704, "signal/advantage_std": 0.16370636224746704, "signal/brier_reward/centered_abs_mean": 0.06473049521446228, "signal/brier_reward/group_std_mean": 0.09205182790756225, "signal/brier_reward/group_zero_std_frac": 0.18611111044883727, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03236524760723114, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03236524760723114, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.006749131891410798, "signal/format_reward/group_std_mean": 0.013331235013902187, "signal/format_reward/group_zero_std_frac": 0.9416666746139526, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.003374565945705399, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.003374565945705399, "signal/mean_confidence_reward/centered_abs_mean": 0.050641607493162155, "signal/mean_confidence_reward/group_std_mean": 0.06986021250486374, "signal/mean_confidence_reward/group_zero_std_frac": 0.20833333432674409, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.064160632173298e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.064160632173298e-07, "step": 880 }, { "calibration/aurc": 0.15065716792697087, "calibration/batch_distribution_entropy": 0.5296564951696621, "calibration/confidence_entropy": 0.4199783583511499, "calibration/coverage@0%": 0.031777646963669755, "calibration/coverage@1%": 0.11898391328220762, "calibration/coverage@10%": 0.2471089132822076, "calibration/coverage@15%": 0.45862708125435725, "calibration/coverage@20%": 0.8443996230503192, "calibration/coverage@25%": 0.8777832718753846, "calibration/coverage@30%": 0.9702349869451699, "calibration/coverage@5%": 0.2471089132822076, "calibration/ece": 0.10150680451701684, "calibration/mean_confidence": 0.7587634881401538, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00225694444444442, "completions/max_length": 3866.4, "completions/max_terminated_length": 3866.4, "completions/mean_length": 975.52880859375, "completions/mean_terminated_length": 977.7569946289062, "completions/min_length": 0.0, "completions/min_terminated_length": 231.6, "epoch": 2.127198410019875, "grad_norm": 0.0005515318480320275, "learning_rate": 9.314903846153847e-07, "loss": -0.002, "num_tokens": 2346027550.0, "reward": 1.3281777381896973, "reward_std": 0.09435855001211166, "rewards/accuracy_reward": 0.7786458253860473, "rewards/brier_reward": 0.8800376653671265, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99765625, "rewards/mean_confidence_reward": 0.7876579642295838, "signal/accuracy_reward/centered_abs_mean": 0.08773871511220932, "signal/accuracy_reward/group_std_mean": 0.11791349202394485, "signal/accuracy_reward/group_zero_std_frac": 0.6527777791023255, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04386935755610466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04386935755610466, "signal/advantage_abs_mean": 0.06953106299042702, "signal/advantage_pre_scale_abs_mean": 0.06953106299042702, "signal/advantage_pre_scale_std": 0.15658720433712006, "signal/advantage_std": 0.15658720433712006, "signal/brier_reward/centered_abs_mean": 0.06259142756462097, "signal/brier_reward/group_std_mean": 0.08321797847747803, "signal/brier_reward/group_zero_std_frac": 0.25277777910232546, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03129571378231048, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03129571378231048, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.004432508710306138, "signal/format_reward/group_std_mean": 0.01076145824044943, "signal/format_reward/group_zero_std_frac": 0.9472222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002216254355153069, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002216254355153069, "signal/mean_confidence_reward/centered_abs_mean": 0.04899854734539986, "signal/mean_confidence_reward/group_std_mean": 0.06584918051958084, "signal/mean_confidence_reward/group_zero_std_frac": 0.2694444477558136, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.899854559425876e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.899854559425876e-07, "step": 885 }, { "calibration/aurc": 0.042939076052349555, "calibration/batch_distribution_entropy": 0.4785760710810291, "calibration/confidence_entropy": 0.4211706358035331, "calibration/coverage@0%": 0.0031304537521815007, "calibration/coverage@1%": 0.26679482984293196, "calibration/coverage@10%": 0.8383180628272251, "calibration/coverage@15%": 0.9478212260034905, "calibration/coverage@20%": 0.9837696335078533, "calibration/coverage@25%": 1.0, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.7732138961605585, "calibration/ece": 0.12707842495636984, "calibration/mean_confidence": 0.7573167539267017, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003298611111111116, "completions/max_length": 3986.4, "completions/max_terminated_length": 3986.4, "completions/mean_length": 1034.3697265625, "completions/mean_terminated_length": 1037.7999755859375, "completions/min_length": 0.0, "completions/min_terminated_length": 242.8, "epoch": 2.13919826002175, "grad_norm": 0.00047502669622190297, "learning_rate": 9.014423076923078e-07, "loss": -0.0035, "num_tokens": 2361055745.0, "reward": 1.3416155099868774, "reward_std": 0.09534274190664291, "rewards/accuracy_reward": 0.7915798664093018, "rewards/brier_reward": 0.8949343681335449, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967013835906983, "rewards/mean_confidence_reward": 0.7690303564071655, "signal/accuracy_reward/centered_abs_mean": 0.0891547292470932, "signal/accuracy_reward/group_std_mean": 0.1299699291586876, "signal/accuracy_reward/group_zero_std_frac": 0.5861111164093018, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0445773646235466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0445773646235466, "signal/advantage_abs_mean": 0.0632906898856163, "signal/advantage_pre_scale_abs_mean": 0.0632906898856163, "signal/advantage_pre_scale_std": 0.1460734248161316, "signal/advantage_std": 0.1460734248161316, "signal/brier_reward/centered_abs_mean": 0.05724892318248749, "signal/brier_reward/group_std_mean": 0.08257125169038773, "signal/brier_reward/group_zero_std_frac": 0.2111111104488373, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.028624461591243745, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.028624461591243745, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.005707465356681496, "signal/format_reward/group_std_mean": 0.011015595495700836, "signal/format_reward/group_zero_std_frac": 0.9527777791023254, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002853732678340748, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002853732678340748, "signal/mean_confidence_reward/centered_abs_mean": 0.04548492580652237, "signal/mean_confidence_reward/group_std_mean": 0.062452132999897006, "signal/mean_confidence_reward/group_zero_std_frac": 0.23611111044883729, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.5484923703043025e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.5484923703043025e-07, "step": 890 }, { "calibration/aurc": 0.0924619984576833, "calibration/batch_distribution_entropy": 0.5231847841893481, "calibration/confidence_entropy": 0.43102382957704577, "calibration/coverage@0%": 0.00783289817232376, "calibration/coverage@1%": 0.10809399477806789, "calibration/coverage@10%": 0.5684603459530025, "calibration/coverage@15%": 0.8173275674499566, "calibration/coverage@20%": 0.8909078546562229, "calibration/coverage@25%": 0.9373368146214099, "calibration/coverage@30%": 0.989556135770235, "calibration/coverage@5%": 0.3793624891209747, "calibration/ece": 0.0929002121409921, "calibration/mean_confidence": 0.7666374292863359, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 3888.0, "completions/max_terminated_length": 3888.0, "completions/mean_length": 1024.5815185546876, "completions/mean_terminated_length": 1027.797705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 216.6, "epoch": 2.151198110023625, "grad_norm": 0.0005185551126487553, "learning_rate": 8.713942307692308e-07, "loss": -0.001, "num_tokens": 2375939852.0, "reward": 1.3130956172943116, "reward_std": 0.09918684959411621, "rewards/accuracy_reward": 0.7572048664093017, "rewards/brier_reward": 0.8721829295158386, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967881917953492, "rewards/mean_confidence_reward": 0.7665390372276306, "signal/accuracy_reward/centered_abs_mean": 0.0900119349360466, "signal/accuracy_reward/group_std_mean": 0.12982572615146637, "signal/accuracy_reward/group_zero_std_frac": 0.5805555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0450059674680233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0450059674680233, "signal/advantage_abs_mean": 0.0662723034620285, "signal/advantage_pre_scale_abs_mean": 0.0662723034620285, "signal/advantage_pre_scale_std": 0.1487656533718109, "signal/advantage_std": 0.1487656533718109, "signal/brier_reward/centered_abs_mean": 0.059759201109409334, "signal/brier_reward/group_std_mean": 0.08652634769678116, "signal/brier_reward/group_zero_std_frac": 0.2111111104488373, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.029879600554704667, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.029879600554704667, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.006027560774236917, "signal/format_reward/group_std_mean": 0.014043217524886131, "signal/format_reward/group_zero_std_frac": 0.9333333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0030137803871184586, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0030137803871184586, "signal/mean_confidence_reward/centered_abs_mean": 0.05555691495537758, "signal/mean_confidence_reward/group_std_mean": 0.07538115680217743, "signal/mean_confidence_reward/group_zero_std_frac": 0.225, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.555691473091429e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.555691473091429e-07, "step": 895 }, { "calibration/aurc": 0.14375380157233436, "calibration/batch_distribution_entropy": 0.47601640564722214, "calibration/confidence_entropy": 0.4212545635807321, "calibration/coverage@0%": 0.008855526544821583, "calibration/coverage@1%": 0.008855526544821583, "calibration/coverage@10%": 0.3312513598781549, "calibration/coverage@15%": 0.5134777523933856, "calibration/coverage@20%": 0.8248259355961706, "calibration/coverage@25%": 0.9400225739773717, "calibration/coverage@30%": 0.9973890339425587, "calibration/coverage@5%": 0.11510552654482158, "calibration/ece": 0.10612883485639683, "calibration/mean_confidence": 0.7425277415143603, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0016493055555555803, "completions/max_length": 3845.0, "completions/max_terminated_length": 3845.0, "completions/mean_length": 999.8962646484375, "completions/mean_terminated_length": 1001.573291015625, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 2.1631979600255, "grad_norm": 0.0004322128079365939, "learning_rate": 8.41346153846154e-07, "loss": 0.0002, "num_tokens": 2390547457.0, "reward": 1.3193276643753051, "reward_std": 0.08748537600040436, "rewards/accuracy_reward": 0.7678819417953491, "rewards/brier_reward": 0.8724077701568603, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9983506917953491, "rewards/mean_confidence_reward": 0.7500564217567444, "signal/accuracy_reward/centered_abs_mean": 0.08206380158662796, "signal/accuracy_reward/group_std_mean": 0.11971472054719925, "signal/accuracy_reward/group_zero_std_frac": 0.6194444537162781, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04103190079331398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04103190079331398, "signal/advantage_abs_mean": 0.05862867832183838, "signal/advantage_pre_scale_abs_mean": 0.05862867832183838, "signal/advantage_pre_scale_std": 0.13332152664661406, "signal/advantage_std": 0.13332152664661406, "signal/brier_reward/centered_abs_mean": 0.05863209068775177, "signal/brier_reward/group_std_mean": 0.0812401831150055, "signal/brier_reward/group_zero_std_frac": 0.2555555611848831, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.029316045343875885, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.029316045343875885, "signal/confidence_one_or_zero/centered_abs_mean": 0.0001681857625953853, "signal/confidence_one_or_zero/group_std_mean": 0.0004910463467240334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9972222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.6818574977151001e-09, "signal/format_reward/centered_abs_mean": 0.0031195745687000453, "signal/format_reward/group_std_mean": 0.007483602315187454, "signal/format_reward/group_zero_std_frac": 0.9638888835906982, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0015597872843500227, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0015597872843500227, "signal/mean_confidence_reward/centered_abs_mean": 0.05635335743427276, "signal/mean_confidence_reward/group_std_mean": 0.07533224448561668, "signal/mean_confidence_reward/group_zero_std_frac": 0.26388889253139497, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.635335526221752e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.635335526221752e-07, "step": 900 }, { "epoch": 2.1631979600255, "eval_calibration/aurc": 0.10131349149295758, "eval_calibration/batch_distribution_entropy": 0.5412362261407814, "eval_calibration/confidence_entropy": 0.43370878218879855, "eval_calibration/coverage@0%": 0.11979166666666667, "eval_calibration/coverage@1%": 0.11979166666666667, "eval_calibration/coverage@10%": 0.5520833333333334, "eval_calibration/coverage@15%": 0.703125, "eval_calibration/coverage@20%": 0.9270833333333334, "eval_calibration/coverage@25%": 1.0, "eval_calibration/coverage@30%": 1.0, "eval_calibration/coverage@5%": 0.13541666666666666, "eval_calibration/ece": 0.1130208333333333, "eval_calibration/mean_confidence": 0.7489583333333334, "eval_completions/clipped_ratio": 0.0034722222222222285, "eval_completions/max_length": 3210.6666666666665, "eval_completions/max_terminated_length": 3210.6666666666665, "eval_completions/mean_length": 1010.8917134602865, "eval_completions/mean_terminated_length": 1014.4606221516927, "eval_completions/min_length": 196.83333333333334, "eval_completions/min_terminated_length": 286.6666666666667, "eval_loss": 0.0, "eval_num_tokens": 2390547457.0, "eval_reward": 1.287815531094869, "eval_reward_std": 0.3091672907272975, "eval_rewards/accuracy_reward": 0.7282986144224802, "eval_rewards/brier_reward": 0.8507899542649587, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.996527781089147, "eval_rewards/mean_confidence_reward": 0.7367187341054281, "eval_runtime": 191.4532, "eval_samples_per_second": 5.223, "eval_signal/accuracy_reward/centered_abs_mean": 0.3834092865387599, "eval_signal/accuracy_reward/group_std_mean": 0.44222476085027057, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19170464326937994, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.19170464326937994, "eval_signal/advantage_abs_mean": 0.25933244079351425, "eval_signal/advantage_pre_scale_abs_mean": 0.25933244079351425, "eval_signal/advantage_pre_scale_std": 0.3080225835243861, "eval_signal/advantage_std": 0.3080225835243861, "eval_signal/brier_reward/centered_abs_mean": 0.1773057853182157, "eval_signal/brier_reward/group_std_mean": 0.2340771108865738, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08865289265910785, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.08865289265910785, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/group_std_mean": 0.0, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "eval_signal/format_reward/centered_abs_mean": 0.0067274305038154125, "eval_signal/format_reward/group_std_mean": 0.01964185480028391, "eval_signal/format_reward/group_zero_std_frac": 0.8888888955116272, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0033637152519077063, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0033637152519077063, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.1927707021435102, "eval_signal/mean_confidence_reward/group_std_mean": 0.23622775574525198, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.927707065381886e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.927707065381886e-06, "eval_steps_per_second": 0.031, "step": 900 }, { "epoch": 2.1631979600255, "step": 900, "train_probe_calibration/aurc": 0.0864932860540263, "train_probe_calibration/batch_distribution_entropy": 0.5590722730163721, "train_probe_calibration/confidence_entropy": 0.4374332695893146, "train_probe_calibration/coverage@0%": 0.19808467741935484, "train_probe_calibration/coverage@1%": 0.19808467741935484, "train_probe_calibration/coverage@10%": 0.691364247311828, "train_probe_calibration/coverage@15%": 0.8061155913978495, "train_probe_calibration/coverage@20%": 0.9420362903225806, "train_probe_calibration/coverage@25%": 0.9838709677419355, "train_probe_calibration/coverage@30%": 1.0, "train_probe_calibration/coverage@5%": 0.2553763440860215, "train_probe_calibration/ece": 0.13618951612903227, "train_probe_calibration/mean_confidence": 0.7411290322580646, "train_probe_completions/clipped_ratio": 0.007638888888888917, "train_probe_completions/max_length": 3425.3333333333335, "train_probe_completions/max_terminated_length": 3425.3333333333335, "train_probe_completions/mean_length": 1026.7526245117188, "train_probe_completions/mean_terminated_length": 1034.6399434407551, "train_probe_completions/min_length": 45.5, "train_probe_completions/min_terminated_length": 240.16666666666666, "train_probe_loss": 0.0, "train_probe_num_tokens": 2390547457.0, "train_probe_reward": 1.3232409159342449, "train_probe_reward_std": 0.28327161570390064, "train_probe_rewards/accuracy_reward": 0.7725694378217062, "train_probe_rewards/brier_reward": 0.8782378733158112, "train_probe_rewards/confidence_one_or_zero": 0.0, "train_probe_rewards/format_reward": 0.995659738779068, "train_probe_rewards/mean_confidence_reward": 0.7427951196829478, "train_probe_runtime": 206.2309, "train_probe_samples_per_second": 4.849, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3403862814108531, "train_probe_signal/accuracy_reward/group_std_mean": 0.4144337127606074, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.17019314070542654, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.17019314070542654, "train_probe_signal/advantage_abs_mean": 0.22689850131670633, "train_probe_signal/advantage_pre_scale_abs_mean": 0.22689850131670633, "train_probe_signal/advantage_pre_scale_std": 0.28413822253545123, "train_probe_signal/advantage_std": 0.28413822253545123, "train_probe_signal/brier_reward/centered_abs_mean": 0.14635499566793442, "train_probe_signal/brier_reward/group_std_mean": 0.2045078327258428, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07317749783396721, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07317749783396721, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.0, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "train_probe_signal/format_reward/centered_abs_mean": 0.008409287935743729, "train_probe_signal/format_reward/group_std_mean": 0.02455231888840596, "train_probe_signal/format_reward/group_zero_std_frac": 0.8611111442248026, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.004204643967871864, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.004204643967871864, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.19124075770378113, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.23699325571457544, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.912407507613049e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.912407507613049e-06, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.10226115351796645, "calibration/batch_distribution_entropy": 0.4997267096116647, "calibration/confidence_entropy": 0.4167986168377104, "calibration/coverage@0%": 0.15158572500565368, "calibration/coverage@1%": 0.27033572500565367, "calibration/coverage@10%": 0.5625205052493438, "calibration/coverage@15%": 0.7192913385826772, "calibration/coverage@20%": 0.7380413385826772, "calibration/coverage@25%": 0.896628937007874, "calibration/coverage@30%": 0.9018372703412073, "calibration/coverage@5%": 0.5275811558150532, "calibration/ece": 0.121751312657189, "calibration/mean_confidence": 0.7879067315253595, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002951388888888906, "completions/max_length": 4065.6, "completions/max_terminated_length": 4065.6, "completions/mean_length": 1052.9957641601563, "completions/mean_terminated_length": 1056.2053466796874, "completions/min_length": 0.0, "completions/min_terminated_length": 234.6, "epoch": 2.175197810027375, "grad_norm": 0.0005023516132496297, "learning_rate": 8.11298076923077e-07, "loss": -0.0033, "num_tokens": 2405769072.0, "reward": 1.2994247198104858, "reward_std": 0.10124247223138809, "rewards/accuracy_reward": 0.7397569298744202, "rewards/brier_reward": 0.8620292901992798, "rewards/confidence_one_or_zero": 0.000260416668606922, "rewards/format_reward": 0.9970486164093018, "rewards/mean_confidence_reward": 0.7294904232025147, "signal/accuracy_reward/centered_abs_mean": 0.11355251967906951, "signal/accuracy_reward/group_std_mean": 0.15000397861003875, "signal/accuracy_reward/group_zero_std_frac": 0.5722222208976746, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05677625983953476, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05677625983953476, "signal/advantage_abs_mean": 0.07381217032670975, "signal/advantage_pre_scale_abs_mean": 0.07381217032670975, "signal/advantage_pre_scale_std": 0.15410908162593842, "signal/advantage_std": 0.15410908162593842, "signal/brier_reward/centered_abs_mean": 0.0708252176642418, "signal/brier_reward/group_std_mean": 0.0941934809088707, "signal/brier_reward/group_zero_std_frac": 0.20833333432674409, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0354126088321209, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.0354126088321209, "signal/confidence_one_or_zero/centered_abs_mean": 0.0004937065881676972, "signal/confidence_one_or_zero/group_std_mean": 0.001174198230728507, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.9370658672387435e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.9370658672387435e-09, "signal/format_reward/centered_abs_mean": 0.005327690939884633, "signal/format_reward/group_std_mean": 0.011196280270814896, "signal/format_reward/group_zero_std_frac": 0.95, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0026638454699423166, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0026638454699423166, "signal/mean_confidence_reward/centered_abs_mean": 0.05713010504841805, "signal/mean_confidence_reward/group_std_mean": 0.07585568577051163, "signal/mean_confidence_reward/group_zero_std_frac": 0.22499999701976775, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.713010182262224e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.713010182262224e-07, "step": 905 }, { "calibration/aurc": 0.08444802498630291, "calibration/batch_distribution_entropy": 0.5526607083680625, "calibration/confidence_entropy": 0.4382396864287301, "calibration/coverage@0%": 0.2, "calibration/coverage@1%": 0.4201643737470463, "calibration/coverage@10%": 0.6126101967754455, "calibration/coverage@15%": 0.7311524296394019, "calibration/coverage@20%": 0.8616819812628165, "calibration/coverage@25%": 0.9477806788511749, "calibration/coverage@30%": 0.974934725848564, "calibration/coverage@5%": 0.48849039316050896, "calibration/ece": 0.13770669609744382, "calibration/mean_confidence": 0.7573178965275529, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.002951388888888906, "completions/max_length": 4019.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 1051.80390625, "completions/mean_terminated_length": 1054.8413696289062, "completions/min_length": 0.0, "completions/min_terminated_length": 254.6, "epoch": 2.1871976600292498, "grad_norm": 0.00040610070573166013, "learning_rate": 7.8125e-07, "loss": -0.0014, "num_tokens": 2420979421.0, "reward": 1.3107996702194213, "reward_std": 0.09648575037717819, "rewards/accuracy_reward": 0.7501736044883728, "rewards/brier_reward": 0.8743626236915588, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9970486044883728, "rewards/mean_confidence_reward": 0.7258116245269776, "signal/accuracy_reward/centered_abs_mean": 0.09781900942325591, "signal/accuracy_reward/group_std_mean": 0.1354443058371544, "signal/accuracy_reward/group_zero_std_frac": 0.5861111283302307, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04890950471162796, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04890950471162796, "signal/advantage_abs_mean": 0.06833629086613655, "signal/advantage_pre_scale_abs_mean": 0.06833629086613655, "signal/advantage_pre_scale_std": 0.15040114521980286, "signal/advantage_std": 0.15040114521980286, "signal/brier_reward/centered_abs_mean": 0.0658588945865631, "signal/brier_reward/group_std_mean": 0.08932664394378662, "signal/brier_reward/group_zero_std_frac": 0.1916666656732559, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03292944729328155, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03292944729328155, "signal/confidence_one_or_zero/centered_abs_mean": 0.0, "signal/confidence_one_or_zero/group_std_mean": 0.0, "signal/confidence_one_or_zero/group_zero_std_frac": 1.0, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 0.0, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 0.0, "signal/format_reward/centered_abs_mean": 0.005284288187976926, "signal/format_reward/group_std_mean": 0.01141110472381115, "signal/format_reward/group_zero_std_frac": 0.9472222208976746, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.002642144093988463, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.002642144093988463, "signal/mean_confidence_reward/centered_abs_mean": 0.06070990487933159, "signal/mean_confidence_reward/group_std_mean": 0.08023345321416855, "signal/mean_confidence_reward/group_zero_std_frac": 0.1972222238779068, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.070990593798342e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.070990593798342e-07, "step": 910 }, { "calibration/aurc": 0.10431353530799983, "calibration/batch_distribution_entropy": 0.6268256149962863, "calibration/confidence_entropy": 0.44810718405882827, "calibration/coverage@0%": 0.07732051970307896, "calibration/coverage@1%": 0.07732051970307896, "calibration/coverage@10%": 0.5228921814048167, "calibration/coverage@15%": 0.6820303107904518, "calibration/coverage@20%": 0.8688432770189509, "calibration/coverage@25%": 0.9225349609722089, "calibration/coverage@30%": 0.9774869109947645, "calibration/coverage@5%": 0.3728251473855394, "calibration/ece": 0.09829693155019112, "calibration/mean_confidence": 0.7239969407417692, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006510416666666652, "completions/max_length": 4068.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 1067.3132080078126, "completions/mean_terminated_length": 1074.1466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 245.0, "epoch": 2.1991975100311247, "grad_norm": 0.0004353199037723243, "learning_rate": 7.512019230769231e-07, "loss": -0.0057, "num_tokens": 2436368405.0, "reward": 1.3187732458114625, "reward_std": 0.10341259986162185, "rewards/accuracy_reward": 0.7693576455116272, "rewards/brier_reward": 0.8746849060058594, "rewards/confidence_one_or_zero": 0.00034722223062999547, "rewards/format_reward": 0.9934895753860473, "rewards/mean_confidence_reward": 0.7203992962837219, "signal/accuracy_reward/centered_abs_mean": 0.10925021767616272, "signal/accuracy_reward/group_std_mean": 0.1492912322282791, "signal/accuracy_reward/group_zero_std_frac": 0.55277778506279, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05462510883808136, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05462510883808136, "signal/advantage_abs_mean": 0.07278931066393853, "signal/advantage_pre_scale_abs_mean": 0.07278931066393853, "signal/advantage_pre_scale_std": 0.15731181800365449, "signal/advantage_std": 0.15731181800365449, "signal/brier_reward/centered_abs_mean": 0.06981933489441872, "signal/brier_reward/group_std_mean": 0.09425708800554275, "signal/brier_reward/group_zero_std_frac": 0.23888889253139495, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03490966744720936, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03490966744720936, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006727430387400091, "signal/confidence_one_or_zero/group_std_mean": 0.0019641853868961334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9888888835906983, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/format_reward/centered_abs_mean": 0.01028103269636631, "signal/format_reward/group_std_mean": 0.01867804005742073, "signal/format_reward/group_zero_std_frac": 0.9222222447395325, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005140516348183155, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005140516348183155, "signal/mean_confidence_reward/centered_abs_mean": 0.05904840007424354, "signal/mean_confidence_reward/group_std_mean": 0.07917508035898209, "signal/mean_confidence_reward/group_zero_std_frac": 0.25277777910232546, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.904839497361536e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.904839497361536e-07, "step": 915 }, { "calibration/aurc": 0.08425741852707916, "calibration/batch_distribution_entropy": 0.5676297149156986, "calibration/confidence_entropy": 0.4450078776912066, "calibration/coverage@0%": 0.07724041005291006, "calibration/coverage@1%": 0.29389457671957675, "calibration/coverage@10%": 0.7548038352881042, "calibration/coverage@15%": 0.8388990987677346, "calibration/coverage@20%": 0.8654615987677345, "calibration/coverage@25%": 0.9019879260088137, "calibration/coverage@30%": 0.9019879260088137, "calibration/coverage@5%": 0.4305872977882769, "calibration/ece": 0.09431300861169814, "calibration/mean_confidence": 0.7255501247547903, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003732638888888884, "completions/max_length": 3997.4, "completions/max_terminated_length": 3997.4, "completions/mean_length": 1056.7252075195313, "completions/mean_terminated_length": 1060.7211669921876, "completions/min_length": 0.0, "completions/min_terminated_length": 212.8, "epoch": 2.2111973600329997, "grad_norm": 0.0004943942185491323, "learning_rate": 7.211538461538461e-07, "loss": -0.0034, "num_tokens": 2451617847.0, "reward": 1.327914309501648, "reward_std": 0.09883684813976287, "rewards/accuracy_reward": 0.7825520753860473, "rewards/brier_reward": 0.8769948005676269, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9962673664093018, "rewards/mean_confidence_reward": 0.7228992819786072, "signal/accuracy_reward/centered_abs_mean": 0.10398762971162796, "signal/accuracy_reward/group_std_mean": 0.14310045242309571, "signal/accuracy_reward/group_zero_std_frac": 0.5694444656372071, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05199381485581398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05199381485581398, "signal/advantage_abs_mean": 0.068203055113554, "signal/advantage_pre_scale_abs_mean": 0.068203055113554, "signal/advantage_pre_scale_std": 0.14753187894821168, "signal/advantage_std": 0.14753187894821168, "signal/brier_reward/centered_abs_mean": 0.06285782903432846, "signal/brier_reward/group_std_mean": 0.08784284293651581, "signal/brier_reward/group_zero_std_frac": 0.2416666716337204, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03142891451716423, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03142891451716423, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.006613498227670789, "signal/format_reward/group_std_mean": 0.014572407864034177, "signal/format_reward/group_zero_std_frac": 0.9305555701255799, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0033067491138353946, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0033067491138353946, "signal/mean_confidence_reward/centered_abs_mean": 0.058849284797906874, "signal/mean_confidence_reward/group_std_mean": 0.07798264026641846, "signal/mean_confidence_reward/group_zero_std_frac": 0.25000000596046446, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.884928327759553e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.884928327759553e-07, "step": 920 }, { "calibration/aurc": 0.06852912423046853, "calibration/batch_distribution_entropy": 0.4895546478305107, "calibration/confidence_entropy": 0.4122755266740642, "calibration/coverage@0%": 0.013037194589877834, "calibration/coverage@1%": 0.013037194589877834, "calibration/coverage@10%": 0.7213705279232112, "calibration/coverage@15%": 0.8847213132635252, "calibration/coverage@20%": 0.9833224258289703, "calibration/coverage@25%": 1.0, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.49637052792321124, "calibration/ece": 0.12446839550610815, "calibration/mean_confidence": 0.7847566263089006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00581597222222221, "completions/max_length": 4018.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1048.4292358398438, "completions/mean_terminated_length": 1054.6198608398438, "completions/min_length": 0.0, "completions/min_terminated_length": 249.6, "epoch": 2.2231972100348747, "grad_norm": 0.0005850245943292975, "learning_rate": 6.911057692307694e-07, "loss": -0.006, "num_tokens": 2466765256.0, "reward": 1.3188425302505493, "reward_std": 0.1013020858168602, "rewards/accuracy_reward": 0.7703993201255799, "rewards/brier_reward": 0.8730870485305786, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9941840171813965, "rewards/mean_confidence_reward": 0.7389010429382324, "signal/accuracy_reward/centered_abs_mean": 0.0968695729970932, "signal/accuracy_reward/group_std_mean": 0.1301523968577385, "signal/accuracy_reward/group_zero_std_frac": 0.6194444417953491, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0484347864985466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0484347864985466, "signal/advantage_abs_mean": 0.0729988969862461, "signal/advantage_pre_scale_abs_mean": 0.0729988969862461, "signal/advantage_pre_scale_std": 0.15467945337295533, "signal/advantage_std": 0.15467945337295533, "signal/brier_reward/centered_abs_mean": 0.07060423120856285, "signal/brier_reward/group_std_mean": 0.0961959645152092, "signal/brier_reward/group_zero_std_frac": 0.21111111342906952, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.035302115604281425, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.035302115604281425, "signal/confidence_one_or_zero/centered_abs_mean": 0.0003363715251907706, "signal/confidence_one_or_zero/group_std_mean": 0.0009820926934480667, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9944444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 3.3637149954302003e-09, "signal/format_reward/centered_abs_mean": 0.009228515718132258, "signal/format_reward/group_std_mean": 0.016490218043327332, "signal/format_reward/group_zero_std_frac": 0.9333333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004614257859066129, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004614257859066129, "signal/mean_confidence_reward/centered_abs_mean": 0.06064431369304657, "signal/mean_confidence_reward/group_std_mean": 0.08149511069059372, "signal/mean_confidence_reward/group_zero_std_frac": 0.22499999701976775, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.06443109063548e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.06443109063548e-07, "step": 925 }, { "calibration/aurc": 0.03595694218469277, "calibration/batch_distribution_entropy": 0.47561039074256894, "calibration/confidence_entropy": 0.4133971965590432, "calibration/coverage@0%": 0.2616409146726755, "calibration/coverage@1%": 0.36350758133934213, "calibration/coverage@10%": 0.9045208333333333, "calibration/coverage@15%": 0.9496249999999999, "calibration/coverage@20%": 0.9496249999999999, "calibration/coverage@25%": 0.9916666666666668, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.775001140062234, "calibration/ece": 0.12463192530431451, "calibration/mean_confidence": 0.757742563470916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009895833333333326, "completions/max_length": 4066.2, "completions/max_terminated_length": 4066.2, "completions/mean_length": 1057.5494018554687, "completions/mean_terminated_length": 1068.076025390625, "completions/min_length": 0.0, "completions/min_terminated_length": 263.2, "epoch": 2.2351970600367497, "grad_norm": 0.0005184356705285609, "learning_rate": 6.610576923076924e-07, "loss": -0.0121, "num_tokens": 2482045185.0, "reward": 1.3334663391113282, "reward_std": 0.10167869478464127, "rewards/accuracy_reward": 0.7933159708976746, "rewards/brier_reward": 0.8834973692893981, "rewards/confidence_one_or_zero": 0.000260416668606922, "rewards/format_reward": 0.9901041626930237, "rewards/mean_confidence_reward": 0.7570329785346985, "signal/accuracy_reward/centered_abs_mean": 0.09232313334941863, "signal/accuracy_reward/group_std_mean": 0.1276793286204338, "signal/accuracy_reward/group_zero_std_frac": 0.6111111164093017, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04616156667470932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04616156667470932, "signal/advantage_abs_mean": 0.07259206399321556, "signal/advantage_pre_scale_abs_mean": 0.07259206399321556, "signal/advantage_pre_scale_std": 0.16337981820106506, "signal/advantage_std": 0.16337981820106506, "signal/brier_reward/centered_abs_mean": 0.06510103866457939, "signal/brier_reward/group_std_mean": 0.0877988576889038, "signal/brier_reward/group_zero_std_frac": 0.2694444477558136, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.032550519332289696, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.032550519332289696, "signal/confidence_one_or_zero/centered_abs_mean": 0.0005045572877861559, "signal/confidence_one_or_zero/group_std_mean": 0.0014731390401721, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9916666626930237, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 5.0455724931453e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 5.0455724931453e-09, "signal/format_reward/centered_abs_mean": 0.01427951380610466, "signal/format_reward/group_std_mean": 0.021290403231978415, "signal/format_reward/group_zero_std_frac": 0.9277778029441833, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00713975690305233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00713975690305233, "signal/mean_confidence_reward/centered_abs_mean": 0.05263287052512169, "signal/mean_confidence_reward/group_std_mean": 0.07049989104270935, "signal/mean_confidence_reward/group_zero_std_frac": 0.2805555582046509, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.263286766421515e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.263286766421515e-07, "step": 930 }, { "calibration/aurc": 0.06613032927649383, "calibration/batch_distribution_entropy": 0.5231070650794793, "calibration/confidence_entropy": 0.4251756356619959, "calibration/coverage@0%": 0.135083198941607, "calibration/coverage@1%": 0.2516425105834844, "calibration/coverage@10%": 0.7676960956102408, "calibration/coverage@15%": 0.8742506409010545, "calibration/coverage@20%": 0.9602369056307192, "calibration/coverage@25%": 0.977127659574468, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.5743170220691061, "calibration/ece": 0.11429494944302047, "calibration/mean_confidence": 0.7506295174555457, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009895833333333326, "completions/max_length": 4069.8, "completions/max_terminated_length": 4069.8, "completions/mean_length": 1159.37255859375, "completions/mean_terminated_length": 1170.6749755859375, "completions/min_length": 0.0, "completions/min_terminated_length": 256.2, "epoch": 2.2471969100386247, "grad_norm": 0.00045156179112382233, "learning_rate": 6.310096153846154e-07, "loss": -0.0137, "num_tokens": 2498484005.0, "reward": 1.310149621963501, "reward_std": 0.11238724738359451, "rewards/accuracy_reward": 0.7602430582046509, "rewards/brier_reward": 0.8699374318122863, "rewards/confidence_one_or_zero": 0.0013020833779592068, "rewards/format_reward": 0.9901041746139526, "rewards/mean_confidence_reward": 0.7356119632720948, "signal/accuracy_reward/centered_abs_mean": 0.1013346329331398, "signal/accuracy_reward/group_std_mean": 0.1445162922143936, "signal/accuracy_reward/group_zero_std_frac": 0.5500000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0506673164665699, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0506673164665699, "signal/advantage_abs_mean": 0.07715610265731812, "signal/advantage_pre_scale_abs_mean": 0.07715610265731812, "signal/advantage_pre_scale_std": 0.1692074030637741, "signal/advantage_std": 0.1692074030637741, "signal/brier_reward/centered_abs_mean": 0.07055672556161881, "signal/brier_reward/group_std_mean": 0.09766194820404053, "signal/brier_reward/group_zero_std_frac": 0.18888889253139496, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.035278362780809404, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.035278362780809404, "signal/confidence_one_or_zero/centered_abs_mean": 0.002001953113358468, "signal/confidence_one_or_zero/group_std_mean": 0.0031128528527915476, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9888888835906983, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 2.0019530921899786e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 2.0019530921899786e-08, "signal/format_reward/centered_abs_mean": 0.015288628824055194, "signal/format_reward/group_std_mean": 0.02638211064040661, "signal/format_reward/group_zero_std_frac": 0.9027777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007644314412027597, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007644314412027597, "signal/mean_confidence_reward/centered_abs_mean": 0.05848725512623787, "signal/mean_confidence_reward/group_std_mean": 0.07705972194671631, "signal/mean_confidence_reward/group_zero_std_frac": 0.21111111342906952, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.848725436408131e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.848725436408131e-07, "step": 935 }, { "calibration/aurc": 0.11815132518268807, "calibration/batch_distribution_entropy": 0.47226450413406784, "calibration/confidence_entropy": 0.40932747557033017, "calibration/coverage@0%": 0.042656759807269136, "calibration/coverage@1%": 0.1442274404355414, "calibration/coverage@10%": 0.5340780076257683, "calibration/coverage@15%": 0.5575727720236741, "calibration/coverage@20%": 0.577885272023674, "calibration/coverage@25%": 0.877038043478261, "calibration/coverage@30%": 0.9627264492753623, "calibration/coverage@5%": 0.47934415073222547, "calibration/ece": 0.10968769680931786, "calibration/mean_confidence": 0.7737516764359967, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005815972222222232, "completions/max_length": 4060.2, "completions/max_terminated_length": 4060.2, "completions/mean_length": 1116.0426391601563, "completions/mean_terminated_length": 1122.5079467773437, "completions/min_length": 0.0, "completions/min_terminated_length": 229.8, "epoch": 2.2591967600404996, "grad_norm": 0.0005417557549662888, "learning_rate": 6.009615384615385e-07, "loss": -0.0076, "num_tokens": 2514427120.0, "reward": 1.3313212394714355, "reward_std": 0.10192690193653106, "rewards/accuracy_reward": 0.7858506917953492, "rewards/brier_reward": 0.8825925946235657, "rewards/confidence_one_or_zero": 0.0003472222248092294, "rewards/format_reward": 0.9941840291023254, "rewards/mean_confidence_reward": 0.7544522404670715, "signal/accuracy_reward/centered_abs_mean": 0.10623372346162796, "signal/accuracy_reward/group_std_mean": 0.13585321754217147, "signal/accuracy_reward/group_zero_std_frac": 0.6250000238418579, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05311686173081398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05311686173081398, "signal/advantage_abs_mean": 0.0760264627635479, "signal/advantage_pre_scale_abs_mean": 0.0760264627635479, "signal/advantage_pre_scale_std": 0.1651286870241165, "signal/advantage_std": 0.1651286870241165, "signal/brier_reward/centered_abs_mean": 0.06428858116269112, "signal/brier_reward/group_std_mean": 0.08706333488225937, "signal/brier_reward/group_zero_std_frac": 0.20000000298023224, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03214429058134556, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03214429058134556, "signal/confidence_one_or_zero/centered_abs_mean": 0.0006727430503815412, "signal/confidence_one_or_zero/group_std_mean": 0.0019641853868961334, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9888888835906983, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.7274299908604006e-09, "signal/format_reward/centered_abs_mean": 0.009564887080341578, "signal/format_reward/group_std_mean": 0.017693409509956837, "signal/format_reward/group_zero_std_frac": 0.9277777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004782443540170789, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004782443540170789, "signal/mean_confidence_reward/centered_abs_mean": 0.05851834490895271, "signal/mean_confidence_reward/group_std_mean": 0.07663241773843765, "signal/mean_confidence_reward/group_zero_std_frac": 0.21388888955116273, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.851834202985628e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.851834202985628e-07, "step": 940 }, { "calibration/aurc": 0.08826813575673205, "calibration/batch_distribution_entropy": 0.48971091718560816, "calibration/confidence_entropy": 0.4106930706073067, "calibration/coverage@0%": 0.030750981675392676, "calibration/coverage@1%": 0.030750981675392676, "calibration/coverage@10%": 0.5146051483420593, "calibration/coverage@15%": 0.847344591509208, "calibration/coverage@20%": 0.9503361545047511, "calibration/coverage@25%": 0.9711229946524064, "calibration/coverage@30%": 0.9711229946524064, "calibration/coverage@5%": 0.44377181500872603, "calibration/ece": 0.08523833226109197, "calibration/mean_confidence": 0.7546494462414952, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00894097222222221, "completions/max_length": 4038.6, "completions/max_terminated_length": 4038.6, "completions/mean_length": 1149.3369140625, "completions/mean_terminated_length": 1159.815283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 242.8, "epoch": 2.2711966100423746, "grad_norm": 0.0005725827650167048, "learning_rate": 5.709134615384615e-07, "loss": -0.0081, "num_tokens": 2530789209.0, "reward": 1.3042078971862794, "reward_std": 0.11931036859750747, "rewards/accuracy_reward": 0.7649305582046508, "rewards/brier_reward": 0.8524109721183777, "rewards/confidence_one_or_zero": 0.0026909722131676973, "rewards/format_reward": 0.9910590410232544, "rewards/mean_confidence_reward": 0.7614357471466064, "signal/accuracy_reward/centered_abs_mean": 0.10575086772441863, "signal/accuracy_reward/group_std_mean": 0.14737944900989533, "signal/accuracy_reward/group_zero_std_frac": 0.5583333373069763, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.05287543386220932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.05287543386220932, "signal/advantage_abs_mean": 0.08349088877439499, "signal/advantage_pre_scale_abs_mean": 0.08349088877439499, "signal/advantage_pre_scale_std": 0.17720816731452943, "signal/advantage_std": 0.17720816731452943, "signal/brier_reward/centered_abs_mean": 0.07384676486253738, "signal/brier_reward/group_std_mean": 0.10237649828195572, "signal/brier_reward/group_zero_std_frac": 0.21666666865348816, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03692338243126869, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03692338243126869, "signal/confidence_one_or_zero/centered_abs_mean": 0.004248046805150807, "signal/confidence_one_or_zero/group_std_mean": 0.006634887075051665, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9777777671813965, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.2480467854488777e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.2480467854488777e-08, "signal/format_reward/centered_abs_mean": 0.01201714426279068, "signal/format_reward/group_std_mean": 0.020848624408245087, "signal/format_reward/group_zero_std_frac": 0.9222222447395325, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00600857213139534, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00600857213139534, "signal/mean_confidence_reward/centered_abs_mean": 0.05797336474061012, "signal/mean_confidence_reward/group_std_mean": 0.07812755703926086, "signal/mean_confidence_reward/group_zero_std_frac": 0.2444444477558136, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.797336143587017e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.797336143587017e-07, "step": 945 }, { "calibration/aurc": 0.10666461228569446, "calibration/batch_distribution_entropy": 0.5129890157519312, "calibration/confidence_entropy": 0.43170988871009053, "calibration/coverage@0%": 0.017342544465721157, "calibration/coverage@1%": 0.12547902740535372, "calibration/coverage@10%": 0.5397160984704479, "calibration/coverage@15%": 0.6344732899058854, "calibration/coverage@20%": 0.8398567212012349, "calibration/coverage@25%": 0.9215911959311681, "calibration/coverage@30%": 0.9756613756613757, "calibration/coverage@5%": 0.37705233487594947, "calibration/ece": 0.12943357735780653, "calibration/mean_confidence": 0.7698767704864158, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009635416666666674, "completions/max_length": 4053.4, "completions/max_terminated_length": 4053.4, "completions/mean_length": 1193.9262939453124, "completions/mean_terminated_length": 1205.6461181640625, "completions/min_length": 0.0, "completions/min_terminated_length": 232.4, "epoch": 2.2831964600442496, "grad_norm": 0.0004705480532720685, "learning_rate": 5.408653846153847e-07, "loss": -0.012, "num_tokens": 2547668552.0, "reward": 1.300801730155945, "reward_std": 0.11203119456768036, "rewards/accuracy_reward": 0.7486111044883728, "rewards/brier_reward": 0.8625257134437561, "rewards/confidence_one_or_zero": 0.0013020833488553763, "rewards/format_reward": 0.9904513835906983, "rewards/mean_confidence_reward": 0.7579123258590699, "signal/accuracy_reward/centered_abs_mean": 0.09639757126569748, "signal/accuracy_reward/group_std_mean": 0.1354551613330841, "signal/accuracy_reward/group_zero_std_frac": 0.5805555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04819878563284874, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04819878563284874, "signal/advantage_abs_mean": 0.07755091041326523, "signal/advantage_pre_scale_abs_mean": 0.07755091041326523, "signal/advantage_pre_scale_std": 0.1750474601984024, "signal/advantage_std": 0.1750474601984024, "signal/brier_reward/centered_abs_mean": 0.06815914735198021, "signal/brier_reward/group_std_mean": 0.09561873078346253, "signal/brier_reward/group_zero_std_frac": 0.2222222238779068, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.034079573675990106, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.034079573675990106, "signal/confidence_one_or_zero/centered_abs_mean": 0.0023383245803415774, "signal/confidence_one_or_zero/group_std_mean": 0.004335219971835613, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9833333253860473, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 2.3383245206787252e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 2.3383245206787252e-08, "signal/format_reward/centered_abs_mean": 0.015462239645421505, "signal/format_reward/group_std_mean": 0.025520271994173525, "signal/format_reward/group_zero_std_frac": 0.9083333492279053, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007731119822710753, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007731119822710753, "signal/mean_confidence_reward/centered_abs_mean": 0.054386617988348006, "signal/mean_confidence_reward/group_std_mean": 0.0734786033630371, "signal/mean_confidence_reward/group_zero_std_frac": 0.2472222238779068, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.438661730750028e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.438661730750028e-07, "step": 950 }, { "epoch": 2.2831964600442496, "eval_calibration/aurc": 0.07995127840902617, "eval_calibration/batch_distribution_entropy": 0.4943516679515538, "eval_calibration/confidence_entropy": 0.42279412328987903, "eval_calibration/coverage@0%": 0.338877688172043, "eval_calibration/coverage@1%": 0.338877688172043, "eval_calibration/coverage@10%": 0.6461693548387096, "eval_calibration/coverage@15%": 0.7034610215053764, "eval_calibration/coverage@20%": 0.8938172043010754, "eval_calibration/coverage@25%": 0.989247311827957, "eval_calibration/coverage@30%": 1.0, "eval_calibration/coverage@5%": 0.604502688172043, "eval_calibration/ece": 0.12303427419354847, "eval_calibration/mean_confidence": 0.772429435483871, "eval_completions/clipped_ratio": 0.010416666666666666, "eval_completions/max_length": 3602.3333333333335, "eval_completions/max_terminated_length": 3602.3333333333335, "eval_completions/mean_length": 1098.920878092448, "eval_completions/mean_terminated_length": 1110.6353556315105, "eval_completions/min_length": 108.16666666666667, "eval_completions/min_terminated_length": 276.8333333333333, "eval_loss": 0.0, "eval_num_tokens": 2547668552.0, "eval_reward": 1.2866131067276, "eval_reward_std": 0.32823309302330017, "eval_rewards/accuracy_reward": 0.7317708333333334, "eval_rewards/brier_reward": 0.8518568972746531, "eval_rewards/confidence_one_or_zero": 0.0034722223257025084, "eval_rewards/format_reward": 0.9895833333333334, "eval_rewards/mean_confidence_reward": 0.7577690581480662, "eval_runtime": 208.4431, "eval_samples_per_second": 4.797, "eval_signal/accuracy_reward/centered_abs_mean": 0.3831922709941864, "eval_signal/accuracy_reward/group_std_mean": 0.44314516087373096, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1915961354970932, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.1915961354970932, "eval_signal/advantage_abs_mean": 0.2712271610895793, "eval_signal/advantage_pre_scale_abs_mean": 0.2712271610895793, "eval_signal/advantage_pre_scale_std": 0.3273568203051885, "eval_signal/advantage_std": 0.3273568203051885, "eval_signal/brier_reward/centered_abs_mean": 0.1798002893726031, "eval_signal/brier_reward/group_std_mean": 0.2398556446035703, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08990014468630154, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.08990014468630154, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.006727430348594983, "eval_signal/confidence_one_or_zero/group_std_mean": 0.019641855110724766, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 0.8888889054457346, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.727429990860401e-08, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.727429990860401e-08, "eval_signal/format_reward/centered_abs_mean": 0.02007378451526165, "eval_signal/format_reward/group_std_mean": 0.05593615584075451, "eval_signal/format_reward/group_zero_std_frac": 0.6944444527228674, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.010036892257630825, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.010036892257630825, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.1813796410957972, "eval_signal/mean_confidence_reward/group_std_mean": 0.22990665833155313, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.8137963593289896e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.8137963593289896e-06, "eval_steps_per_second": 0.029, "step": 950 }, { "epoch": 2.2831964600442496, "step": 950, "train_probe_calibration/aurc": 0.09415443116993773, "train_probe_calibration/batch_distribution_entropy": 0.4993292690332749, "train_probe_calibration/confidence_entropy": 0.4249416370806662, "train_probe_calibration/coverage@0%": 0.020833333333333332, "train_probe_calibration/coverage@1%": 0.020833333333333332, "train_probe_calibration/coverage@10%": 0.6875, "train_probe_calibration/coverage@15%": 0.8072916666666666, "train_probe_calibration/coverage@20%": 0.875, "train_probe_calibration/coverage@25%": 0.9583333333333334, "train_probe_calibration/coverage@30%": 1.0, "train_probe_calibration/coverage@5%": 0.4583333333333333, "train_probe_calibration/ece": 0.1086458333333334, "train_probe_calibration/mean_confidence": 0.7653125, "train_probe_completions/clipped_ratio": 0.005208333333333333, "train_probe_completions/max_length": 3533.6666666666665, "train_probe_completions/max_terminated_length": 3533.6666666666665, "train_probe_completions/mean_length": 1143.7672322591145, "train_probe_completions/mean_terminated_length": 1149.722900390625, "train_probe_completions/min_length": 83.66666666666667, "train_probe_completions/min_terminated_length": 236.33333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 2547668552.0, "train_probe_reward": 1.3337283929189045, "train_probe_reward_std": 0.29232698182264966, "train_probe_rewards/accuracy_reward": 0.793402781089147, "train_probe_rewards/brier_reward": 0.8801149626572927, "train_probe_rewards/confidence_one_or_zero": 0.0052083334885537624, "train_probe_rewards/format_reward": 0.9939236144224802, "train_probe_rewards/mean_confidence_reward": 0.7691319088141123, "train_probe_runtime": 210.9931, "train_probe_samples_per_second": 4.739, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3221571197112401, "train_probe_signal/accuracy_reward/group_std_mean": 0.4056045860052109, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.16107855985562006, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.16107855985562006, "train_probe_signal/advantage_abs_mean": 0.22466853260993958, "train_probe_signal/advantage_pre_scale_abs_mean": 0.22466853260993958, "train_probe_signal/advantage_pre_scale_std": 0.2915739019711812, "train_probe_signal/advantage_std": 0.2915739019711812, "train_probe_signal/brier_reward/centered_abs_mean": 0.15078351646661758, "train_probe_signal/brier_reward/group_std_mean": 0.21451808760563532, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07539175823330879, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07539175823330879, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.009982638681928316, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.026473373795549076, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 0.8611111342906952, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 9.982638478807833e-08, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 9.982638478807833e-08, "train_probe_signal/format_reward/centered_abs_mean": 0.011773003110041222, "train_probe_signal/format_reward/group_std_mean": 0.034373246443768345, "train_probe_signal/format_reward/group_zero_std_frac": 0.8055555820465088, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.005886501555020611, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.005886501555020611, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.17283960431814194, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.22278130302826563, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.728396019492114e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.728396019492114e-06, "train_probe_steps_per_second": 0.028 }, { "calibration/aurc": 0.12427024281003125, "calibration/batch_distribution_entropy": 0.5652110855623905, "calibration/confidence_entropy": 0.44264124409718997, "calibration/coverage@0%": 0.011486890774586596, "calibration/coverage@1%": 0.011486890774586596, "calibration/coverage@10%": 0.48930872521094837, "calibration/coverage@15%": 0.6250341734216306, "calibration/coverage@20%": 0.853759471719336, "calibration/coverage@25%": 0.9188437347148259, "calibration/coverage@30%": 0.9562005277044854, "calibration/coverage@5%": 0.2936977864965117, "calibration/ece": 0.09334695678333663, "calibration/mean_confidence": 0.7407820153686983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009722222222222233, "completions/max_length": 4068.2, "completions/max_terminated_length": 4068.2, "completions/mean_length": 1101.2449462890625, "completions/mean_terminated_length": 1112.2766235351562, "completions/min_length": 0.0, "completions/min_terminated_length": 245.8, "epoch": 2.2951963100461246, "grad_norm": 0.00044039334170520306, "learning_rate": 5.108173076923077e-07, "loss": -0.0108, "num_tokens": 2563431534.0, "reward": 1.326154351234436, "reward_std": 0.10454436242580414, "rewards/accuracy_reward": 0.7868923544883728, "rewards/brier_reward": 0.8751229405403137, "rewards/confidence_one_or_zero": 0.003038194449618459, "rewards/format_reward": 0.9902777791023254, "rewards/mean_confidence_reward": 0.7743185639381409, "signal/accuracy_reward/centered_abs_mean": 0.087353515625, "signal/accuracy_reward/group_std_mean": 0.12566689252853394, "signal/accuracy_reward/group_zero_std_frac": 0.594444465637207, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0436767578125, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0436767578125, "signal/advantage_abs_mean": 0.07220623940229416, "signal/advantage_pre_scale_abs_mean": 0.07220623940229416, "signal/advantage_pre_scale_std": 0.16324632465839387, "signal/advantage_std": 0.16324632465839387, "signal/brier_reward/centered_abs_mean": 0.06715752929449081, "signal/brier_reward/group_std_mean": 0.09402596056461335, "signal/brier_reward/group_zero_std_frac": 0.175, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03357876464724541, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03357876464724541, "signal/confidence_one_or_zero/centered_abs_mean": 0.004953341977670789, "signal/confidence_one_or_zero/group_std_mean": 0.009567244164645671, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9583333253860473, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.953342198632527e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.953342198632527e-08, "signal/format_reward/centered_abs_mean": 0.014029948133975267, "signal/format_reward/group_std_mean": 0.02410128153860569, "signal/format_reward/group_zero_std_frac": 0.9055555582046508, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0070149740669876335, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0070149740669876335, "signal/mean_confidence_reward/centered_abs_mean": 0.054025448858737946, "signal/mean_confidence_reward/group_std_mean": 0.07329589575529098, "signal/mean_confidence_reward/group_zero_std_frac": 0.18888888955116273, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.402544786647923e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.402544786647923e-07, "step": 955 }, { "calibration/aurc": 0.10259732345912202, "calibration/batch_distribution_entropy": 0.5282419338415619, "calibration/confidence_entropy": 0.42012982319340536, "calibration/coverage@0%": 0.03606084091117848, "calibration/coverage@1%": 0.15711347249012583, "calibration/coverage@10%": 0.6745257531918802, "calibration/coverage@15%": 0.6938678584550381, "calibration/coverage@20%": 0.7492461479287224, "calibration/coverage@25%": 0.8890652268760908, "calibration/coverage@30%": 0.9340314136125654, "calibration/coverage@5%": 0.5972176391567925, "calibration/ece": 0.11567156356204644, "calibration/mean_confidence": 0.7513238897308718, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005468750000000022, "completions/max_length": 3994.4, "completions/max_terminated_length": 3994.4, "completions/mean_length": 1177.2173095703124, "completions/mean_terminated_length": 1183.827001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 278.4, "epoch": 2.3071961600479995, "grad_norm": 0.00044149364111945033, "learning_rate": 4.807692307692308e-07, "loss": -0.006, "num_tokens": 2580074485.0, "reward": 1.3156062364578247, "reward_std": 0.1039289191365242, "rewards/accuracy_reward": 0.7618923544883728, "rewards/brier_reward": 0.874773895740509, "rewards/confidence_one_or_zero": 0.0026041667500976474, "rewards/format_reward": 0.99453125, "rewards/mean_confidence_reward": 0.7449097275733948, "signal/accuracy_reward/centered_abs_mean": 0.1010687917470932, "signal/accuracy_reward/group_std_mean": 0.139707949757576, "signal/accuracy_reward/group_zero_std_frac": 0.5805555641651153, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0505343958735466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0505343958735466, "signal/advantage_abs_mean": 0.07287612706422805, "signal/advantage_pre_scale_abs_mean": 0.07287612706422805, "signal/advantage_pre_scale_std": 0.15980434715747832, "signal/advantage_std": 0.15980434715747832, "signal/brier_reward/centered_abs_mean": 0.06562938019633294, "signal/brier_reward/group_std_mean": 0.09041293561458588, "signal/brier_reward/group_zero_std_frac": 0.1888888880610466, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03281469009816647, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03281469009816647, "signal/confidence_one_or_zero/centered_abs_mean": 0.004394531238358468, "signal/confidence_one_or_zero/group_std_mean": 0.007149875164031982, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9749999880790711, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.394531103457666e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.394531103457666e-08, "signal/format_reward/centered_abs_mean": 0.009727647644467652, "signal/format_reward/group_std_mean": 0.018769985437393187, "signal/format_reward/group_zero_std_frac": 0.9222222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004863823822233826, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004863823822233826, "signal/mean_confidence_reward/centered_abs_mean": 0.057137486338615415, "signal/mean_confidence_reward/group_std_mean": 0.0768507570028305, "signal/mean_confidence_reward/group_zero_std_frac": 0.20000000149011612, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.713748578273226e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.713748578273226e-07, "step": 960 }, { "calibration/aurc": 0.13175842294663376, "calibration/batch_distribution_entropy": 0.4615743579268333, "calibration/confidence_entropy": 0.40897898252906567, "calibration/coverage@0%": 0.16241601446018633, "calibration/coverage@1%": 0.16241601446018633, "calibration/coverage@10%": 0.6641173654211074, "calibration/coverage@15%": 0.7294682078619089, "calibration/coverage@20%": 0.7704685233536408, "calibration/coverage@25%": 0.7892674789672178, "calibration/coverage@30%": 0.8038888888888888, "calibration/coverage@5%": 0.4638931319465251, "calibration/ece": 0.14195280236259128, "calibration/mean_confidence": 0.7646063181365836, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009809027777777767, "completions/max_length": 4045.2, "completions/max_terminated_length": 4045.2, "completions/mean_length": 1105.3580688476563, "completions/mean_terminated_length": 1116.5384521484375, "completions/min_length": 54.8, "completions/min_terminated_length": 231.2, "epoch": 2.3191960100498745, "grad_norm": 0.0005523549625650048, "learning_rate": 4.507211538461539e-07, "loss": -0.0127, "num_tokens": 2595886226.0, "reward": 1.2976704120635987, "reward_std": 0.10254855304956437, "rewards/accuracy_reward": 0.7459201335906982, "rewards/brier_reward": 0.8592142105102539, "rewards/confidence_one_or_zero": 0.00598958320915699, "rewards/format_reward": 0.9901909708976746, "rewards/mean_confidence_reward": 0.7719730854034423, "signal/accuracy_reward/centered_abs_mean": 0.09232313483953476, "signal/accuracy_reward/group_std_mean": 0.1237046867609024, "signal/accuracy_reward/group_zero_std_frac": 0.6416666626930236, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04616156741976738, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04616156741976738, "signal/advantage_abs_mean": 0.07487332969903945, "signal/advantage_pre_scale_abs_mean": 0.07487332969903945, "signal/advantage_pre_scale_std": 0.16851918399333954, "signal/advantage_std": 0.16851918399333954, "signal/brier_reward/centered_abs_mean": 0.06501694023609161, "signal/brier_reward/group_std_mean": 0.08838980495929719, "signal/brier_reward/group_zero_std_frac": 0.21666666269302368, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03250847011804581, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03250847011804581, "signal/confidence_one_or_zero/centered_abs_mean": 0.00777452252805233, "signal/confidence_one_or_zero/group_std_mean": 0.012440861202776432, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9555555701255798, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 7.774522430281649e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 7.774522430281649e-08, "signal/format_reward/centered_abs_mean": 0.01432834193110466, "signal/format_reward/group_std_mean": 0.02126956470310688, "signal/format_reward/group_zero_std_frac": 0.9305555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.00716417096555233, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00716417096555233, "signal/mean_confidence_reward/centered_abs_mean": 0.05071159824728966, "signal/mean_confidence_reward/group_std_mean": 0.06935789436101913, "signal/mean_confidence_reward/group_zero_std_frac": 0.23888888955116272, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.071159648650792e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.071159648650792e-07, "step": 965 }, { "calibration/aurc": 0.059198353555072816, "calibration/batch_distribution_entropy": 0.4430891795981953, "calibration/confidence_entropy": 0.39717231210996445, "calibration/coverage@0%": 0.035416666666666666, "calibration/coverage@1%": 0.1640625, "calibration/coverage@10%": 0.7171875000000001, "calibration/coverage@15%": 0.9390625, "calibration/coverage@20%": 0.9776041666666668, "calibration/coverage@25%": 1.0, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.6213541666666667, "calibration/ece": 0.09165104166666657, "calibration/mean_confidence": 0.8055260416666667, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00277777777777779, "completions/max_length": 3958.4, "completions/max_terminated_length": 3958.4, "completions/mean_length": 1072.6743774414062, "completions/mean_terminated_length": 1075.72587890625, "completions/min_length": 0.0, "completions/min_terminated_length": 221.2, "epoch": 2.3311958600517495, "grad_norm": 0.00043361177085898817, "learning_rate": 4.20673076923077e-07, "loss": -0.0026, "num_tokens": 2611380075.0, "reward": 1.321795654296875, "reward_std": 0.0953572928905487, "rewards/accuracy_reward": 0.770399296283722, "rewards/brier_reward": 0.8759540915489197, "rewards/confidence_one_or_zero": 0.004340277868323028, "rewards/format_reward": 0.9972222328186036, "rewards/mean_confidence_reward": 0.7821171760559082, "signal/accuracy_reward/centered_abs_mean": 0.08816731721162796, "signal/accuracy_reward/group_std_mean": 0.11881137639284134, "signal/accuracy_reward/group_zero_std_frac": 0.6472222208976746, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04408365860581398, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04408365860581398, "signal/advantage_abs_mean": 0.06967429220676422, "signal/advantage_pre_scale_abs_mean": 0.06967429220676422, "signal/advantage_pre_scale_std": 0.15514695644378662, "signal/advantage_std": 0.15514695644378662, "signal/brier_reward/centered_abs_mean": 0.06540617346763611, "signal/brier_reward/group_std_mean": 0.08769682794809341, "signal/brier_reward/group_zero_std_frac": 0.18611111044883727, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.032703086733818054, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.032703086733818054, "signal/confidence_one_or_zero/centered_abs_mean": 0.00684678815305233, "signal/confidence_one_or_zero/group_std_mean": 0.012119800969958305, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9527777910232544, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.846787954373213e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.846787954373213e-08, "signal/format_reward/centered_abs_mean": 0.0050672741956077514, "signal/format_reward/group_std_mean": 0.010408315248787403, "signal/format_reward/group_zero_std_frac": 0.9555555582046509, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0025336370978038757, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0025336370978038757, "signal/mean_confidence_reward/centered_abs_mean": 0.049856779724359514, "signal/mean_confidence_reward/group_std_mean": 0.0677779644727707, "signal/mean_confidence_reward/group_zero_std_frac": 0.19722222089767455, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.985677890090301e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.985677890090301e-07, "step": 970 }, { "calibration/aurc": 0.14674736436120162, "calibration/batch_distribution_entropy": 0.4448055923218175, "calibration/confidence_entropy": 0.4059105911449127, "calibration/coverage@0%": 0.04994810108015442, "calibration/coverage@1%": 0.04994810108015442, "calibration/coverage@10%": 0.3267353817819088, "calibration/coverage@15%": 0.5469437151152421, "calibration/coverage@20%": 0.5799700309047159, "calibration/coverage@25%": 0.8511285776521188, "calibration/coverage@30%": 0.9220020378984793, "calibration/coverage@5%": 0.2327606010801544, "calibration/ece": 0.12046806631478568, "calibration/mean_confidence": 0.7986770927443603, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009722222222222188, "completions/max_length": 4007.8, "completions/max_terminated_length": 4007.8, "completions/mean_length": 1117.666259765625, "completions/mean_terminated_length": 1128.6537841796876, "completions/min_length": 0.0, "completions/min_terminated_length": 223.4, "epoch": 2.3431957100536245, "grad_norm": 0.0005584514583460987, "learning_rate": 3.90625e-07, "loss": -0.0134, "num_tokens": 2627361542.0, "reward": 1.3165238618850708, "reward_std": 0.10956877171993255, "rewards/accuracy_reward": 0.7708333373069763, "rewards/brier_reward": 0.8719209671020508, "rewards/confidence_one_or_zero": 0.00477430559694767, "rewards/format_reward": 0.9902777791023254, "rewards/mean_confidence_reward": 0.7796032905578614, "signal/accuracy_reward/centered_abs_mean": 0.09867621511220932, "signal/accuracy_reward/group_std_mean": 0.13598748594522475, "signal/accuracy_reward/group_zero_std_frac": 0.5888888835906982, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04933810755610466, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04933810755610466, "signal/advantage_abs_mean": 0.07737244740128517, "signal/advantage_pre_scale_abs_mean": 0.07737244740128517, "signal/advantage_pre_scale_std": 0.17126381993293763, "signal/advantage_std": 0.17126381993293763, "signal/brier_reward/centered_abs_mean": 0.07085508033633232, "signal/brier_reward/group_std_mean": 0.09679379016160965, "signal/brier_reward/group_zero_std_frac": 0.23333334028720856, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03542754016816616, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03542754016816616, "signal/confidence_one_or_zero/centered_abs_mean": 0.006060112919658423, "signal/confidence_one_or_zero/group_std_mean": 0.010480996314436197, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9611111044883728, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.060112625050352e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.060112625050352e-08, "signal/format_reward/centered_abs_mean": 0.013433159794658422, "signal/format_reward/group_std_mean": 0.022467734292149544, "signal/format_reward/group_zero_std_frac": 0.9138888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006716579897329211, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006716579897329211, "signal/mean_confidence_reward/centered_abs_mean": 0.05351639166474342, "signal/mean_confidence_reward/group_std_mean": 0.07240484282374382, "signal/mean_confidence_reward/group_zero_std_frac": 0.2583333343267441, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.351639288164734e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.351639288164734e-07, "step": 975 }, { "calibration/aurc": 0.1037124169781116, "calibration/batch_distribution_entropy": 0.3998346456350418, "calibration/confidence_entropy": 0.3934008891732773, "calibration/coverage@0%": 0.019389381832266947, "calibration/coverage@1%": 0.1562040032421886, "calibration/coverage@10%": 0.5584002064623801, "calibration/coverage@15%": 0.76452967044922, "calibration/coverage@20%": 0.781753663003663, "calibration/coverage@25%": 0.7954899267399268, "calibration/coverage@30%": 0.9338541666666668, "calibration/coverage@5%": 0.5187937552004132, "calibration/ece": 0.10378426579132354, "calibration/mean_confidence": 0.8088986243711686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006770833333333326, "completions/max_length": 4022.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 1120.2183349609375, "completions/mean_terminated_length": 1127.75498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 250.4, "epoch": 2.3551955600554995, "grad_norm": 0.0005596231785602868, "learning_rate": 3.6057692307692306e-07, "loss": -0.0063, "num_tokens": 2643374073.0, "reward": 1.300755763053894, "reward_std": 0.11279069930315018, "rewards/accuracy_reward": 0.7458333373069763, "rewards/brier_reward": 0.8624338150024414, "rewards/confidence_one_or_zero": 0.004774305620230734, "rewards/format_reward": 0.9932291626930236, "rewards/mean_confidence_reward": 0.761440098285675, "signal/accuracy_reward/centered_abs_mean": 0.103125, "signal/accuracy_reward/group_std_mean": 0.143071149289608, "signal/accuracy_reward/group_zero_std_frac": 0.5666666746139526, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0515625, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0515625, "signal/advantage_abs_mean": 0.08098347783088684, "signal/advantage_pre_scale_abs_mean": 0.08098347783088684, "signal/advantage_pre_scale_std": 0.1708381861448288, "signal/advantage_std": 0.1708381861448288, "signal/brier_reward/centered_abs_mean": 0.07258971482515335, "signal/brier_reward/group_std_mean": 0.09895318895578384, "signal/brier_reward/group_zero_std_frac": 0.18055555373430252, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.036294857412576674, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.036294857412576674, "signal/confidence_one_or_zero/centered_abs_mean": 0.007633463479578495, "signal/confidence_one_or_zero/group_std_mean": 0.01382398819550872, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9444444656372071, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 7.6334632126418e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 7.6334632126418e-08, "signal/format_reward/centered_abs_mean": 0.010112847341224552, "signal/format_reward/group_std_mean": 0.01711827255785465, "signal/format_reward/group_zero_std_frac": 0.9333333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005056423670612276, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005056423670612276, "signal/mean_confidence_reward/centered_abs_mean": 0.05507693514227867, "signal/mean_confidence_reward/group_std_mean": 0.07473518252372742, "signal/mean_confidence_reward/group_zero_std_frac": 0.20277778208255767, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.507693458639551e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.507693458639551e-07, "step": 980 }, { "calibration/aurc": 0.09219226959477671, "calibration/batch_distribution_entropy": 0.4775696969320794, "calibration/confidence_entropy": 0.42146092064811214, "calibration/coverage@0%": 0.00944502099766539, "calibration/coverage@1%": 0.00944502099766539, "calibration/coverage@10%": 0.6558294854881266, "calibration/coverage@15%": 0.7085298482849604, "calibration/coverage@20%": 0.9023746701846965, "calibration/coverage@25%": 0.9387862796833772, "calibration/coverage@30%": 0.9725593667546175, "calibration/coverage@5%": 0.561711601843282, "calibration/ece": 0.12220476421947153, "calibration/mean_confidence": 0.7846327603008285, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005902777777777768, "completions/max_length": 4043.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 1113.4710205078125, "completions/mean_terminated_length": 1119.9992431640626, "completions/min_length": 0.0, "completions/min_terminated_length": 254.4, "epoch": 2.3671954100573744, "grad_norm": 0.0004768665530718863, "learning_rate": 3.305288461538462e-07, "loss": -0.0091, "num_tokens": 2659304235.0, "reward": 1.3135586977005005, "reward_std": 0.10560905784368516, "rewards/accuracy_reward": 0.7682291626930237, "rewards/brier_reward": 0.8647754192352295, "rewards/confidence_one_or_zero": 0.004947916674427688, "rewards/format_reward": 0.9940972328186035, "rewards/mean_confidence_reward": 0.773299491405487, "signal/accuracy_reward/centered_abs_mean": 0.09091796875, "signal/accuracy_reward/group_std_mean": 0.12922247350215912, "signal/accuracy_reward/group_zero_std_frac": 0.6027777910232544, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.045458984375, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.045458984375, "signal/advantage_abs_mean": 0.0726355865597725, "signal/advantage_pre_scale_abs_mean": 0.0726355865597725, "signal/advantage_pre_scale_std": 0.16614577174186707, "signal/advantage_std": 0.16614577174186707, "signal/brier_reward/centered_abs_mean": 0.06809832900762558, "signal/brier_reward/group_std_mean": 0.09460703134536744, "signal/brier_reward/group_zero_std_frac": 0.2166666716337204, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03404916450381279, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03404916450381279, "signal/confidence_one_or_zero/centered_abs_mean": 0.007101779524236918, "signal/confidence_one_or_zero/group_std_mean": 0.011190783884376287, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9583333373069763, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 7.101779360141336e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 7.101779360141336e-08, "signal/format_reward/centered_abs_mean": 0.010243055410683155, "signal/format_reward/group_std_mean": 0.018895946256816388, "signal/format_reward/group_zero_std_frac": 0.925000011920929, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0051215277053415775, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0051215277053415775, "signal/mean_confidence_reward/centered_abs_mean": 0.05528104305267334, "signal/mean_confidence_reward/group_std_mean": 0.0736970141530037, "signal/mean_confidence_reward/group_zero_std_frac": 0.23333333134651185, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.528104111363063e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.528104111363063e-07, "step": 985 }, { "calibration/aurc": 0.10816224432780455, "calibration/batch_distribution_entropy": 0.44206927784194117, "calibration/confidence_entropy": 0.4026672884882415, "calibration/coverage@0%": 0.022935726320178255, "calibration/coverage@1%": 0.3149192445969407, "calibration/coverage@10%": 0.4634494085455597, "calibration/coverage@15%": 0.7204573234636082, "calibration/coverage@20%": 0.8341025259843524, "calibration/coverage@25%": 0.9086715798850811, "calibration/coverage@30%": 0.97236727589208, "calibration/coverage@5%": 0.4363660752122264, "calibration/ece": 0.12028688573378161, "calibration/mean_confidence": 0.8060216912669338, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005729166666666674, "completions/max_length": 4042.8, "completions/max_terminated_length": 4042.8, "completions/mean_length": 1098.6085083007813, "completions/mean_terminated_length": 1104.9492309570312, "completions/min_length": 0.0, "completions/min_terminated_length": 247.6, "epoch": 2.3791952600592494, "grad_norm": 0.0004494332242757082, "learning_rate": 3.0048076923076924e-07, "loss": -0.0063, "num_tokens": 2675040749.0, "reward": 1.3173902750015258, "reward_std": 0.10204194486141205, "rewards/accuracy_reward": 0.7671006917953491, "rewards/brier_reward": 0.8733930826187134, "rewards/confidence_one_or_zero": 0.004600694589316845, "rewards/format_reward": 0.9942708253860474, "rewards/mean_confidence_reward": 0.7852690815925598, "signal/accuracy_reward/centered_abs_mean": 0.0937228724360466, "signal/accuracy_reward/group_std_mean": 0.1246465116739273, "signal/accuracy_reward/group_zero_std_frac": 0.6388888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0468614362180233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0468614362180233, "signal/advantage_abs_mean": 0.07466076761484146, "signal/advantage_pre_scale_abs_mean": 0.07466076761484146, "signal/advantage_pre_scale_std": 0.164698725938797, "signal/advantage_std": 0.164698725938797, "signal/brier_reward/centered_abs_mean": 0.06477420330047608, "signal/brier_reward/group_std_mean": 0.08835353553295136, "signal/brier_reward/group_zero_std_frac": 0.2361111134290695, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03238710165023804, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03238710165023804, "signal/confidence_one_or_zero/centered_abs_mean": 0.006722005270421505, "signal/confidence_one_or_zero/group_std_mean": 0.012391660921275615, "signal/confidence_one_or_zero/group_zero_std_frac": 0.95, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.722004997072872e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.722004997072872e-08, "signal/format_reward/centered_abs_mean": 0.009006076212972402, "signal/format_reward/group_std_mean": 0.016139536164700985, "signal/format_reward/group_zero_std_frac": 0.9361111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004503038106486201, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004503038106486201, "signal/mean_confidence_reward/centered_abs_mean": 0.051574555784463884, "signal/mean_confidence_reward/group_std_mean": 0.06871222704648972, "signal/mean_confidence_reward/group_zero_std_frac": 0.2666666656732559, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.157455518656206e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.157455518656206e-07, "step": 990 }, { "calibration/aurc": 0.06498000403743344, "calibration/batch_distribution_entropy": 0.4504352013862027, "calibration/confidence_entropy": 0.4068750149487831, "calibration/coverage@0%": 0.13777479866105583, "calibration/coverage@1%": 0.3807261875499447, "calibration/coverage@10%": 0.6487281819417158, "calibration/coverage@15%": 0.8466206313618022, "calibration/coverage@20%": 0.9264441497200118, "calibration/coverage@25%": 0.9386663719422341, "calibration/coverage@30%": 0.9458885941644561, "calibration/coverage@5%": 0.6122614996057899, "calibration/ece": 0.10399259737979583, "calibration/mean_confidence": 0.7663507755654584, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008246527777777768, "completions/max_length": 4069.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 1071.7584228515625, "completions/mean_terminated_length": 1080.6629638671875, "completions/min_length": 0.0, "completions/min_terminated_length": 225.2, "epoch": 2.3911951100611244, "grad_norm": 0.0006326537695713341, "learning_rate": 2.7043269230769233e-07, "loss": -0.0088, "num_tokens": 2690484654.0, "reward": 1.3288020372390748, "reward_std": 0.10247245877981186, "rewards/accuracy_reward": 0.7879340291023255, "rewards/brier_reward": 0.8779008269309998, "rewards/confidence_one_or_zero": 0.004600694461259991, "rewards/format_reward": 0.9917534708976745, "rewards/mean_confidence_reward": 0.7859972238540649, "signal/accuracy_reward/centered_abs_mean": 0.09366862028837204, "signal/accuracy_reward/group_std_mean": 0.1289836958050728, "signal/accuracy_reward/group_zero_std_frac": 0.6083333492279053, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04683431014418602, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04683431014418602, "signal/advantage_abs_mean": 0.0721493236720562, "signal/advantage_pre_scale_abs_mean": 0.0721493236720562, "signal/advantage_pre_scale_std": 0.16600977778434753, "signal/advantage_std": 0.16600977778434753, "signal/brier_reward/centered_abs_mean": 0.0602824330329895, "signal/brier_reward/group_std_mean": 0.08540496081113816, "signal/brier_reward/group_zero_std_frac": 0.21388889253139495, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03014121651649475, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03014121651649475, "signal/confidence_one_or_zero/centered_abs_mean": 0.007687716861255467, "signal/confidence_one_or_zero/group_std_mean": 0.013945099478587509, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9444444417953491, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 7.68771684533931e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 7.68771684533931e-08, "signal/format_reward/centered_abs_mean": 0.010725911613553763, "signal/format_reward/group_std_mean": 0.018312014266848566, "signal/format_reward/group_zero_std_frac": 0.9305555820465088, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005362955806776881, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005362955806776881, "signal/mean_confidence_reward/centered_abs_mean": 0.04925224408507347, "signal/mean_confidence_reward/group_std_mean": 0.06722581312060356, "signal/mean_confidence_reward/group_zero_std_frac": 0.23611111044883729, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 4.925224175167386e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 4.925224175167386e-07, "step": 995 }, { "calibration/aurc": 0.07392916398070537, "calibration/batch_distribution_entropy": 0.4764187583325599, "calibration/confidence_entropy": 0.41146070738884477, "calibration/coverage@0%": 0.009486972945947447, "calibration/coverage@1%": 0.16092300427754014, "calibration/coverage@10%": 0.8021344863092356, "calibration/coverage@15%": 0.8667249253068787, "calibration/coverage@20%": 0.8928878988007887, "calibration/coverage@25%": 0.9686684073107049, "calibration/coverage@30%": 0.9885117493472585, "calibration/coverage@5%": 0.5535842951269097, "calibration/ece": 0.10532341602535719, "calibration/mean_confidence": 0.7816812220708037, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003732638888888884, "completions/max_length": 3951.4, "completions/max_terminated_length": 3951.4, "completions/mean_length": 1114.0869140625, "completions/mean_terminated_length": 1118.4550537109376, "completions/min_length": 0.0, "completions/min_terminated_length": 248.0, "epoch": 2.4031949600629994, "grad_norm": 0.0005221031024120748, "learning_rate": 2.403846153846154e-07, "loss": -0.0021, "num_tokens": 2706428183.0, "reward": 1.3253002166748047, "reward_std": 0.10136813819408416, "rewards/accuracy_reward": 0.7802951335906982, "rewards/brier_reward": 0.8741090655326843, "rewards/confidence_one_or_zero": 0.006076388922519982, "rewards/format_reward": 0.9961805582046509, "rewards/mean_confidence_reward": 0.7722291588783264, "signal/accuracy_reward/centered_abs_mean": 0.1008517786860466, "signal/accuracy_reward/group_std_mean": 0.13480582237243652, "signal/accuracy_reward/group_zero_std_frac": 0.6138889074325562, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0504258893430233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0504258893430233, "signal/advantage_abs_mean": 0.07374473214149475, "signal/advantage_pre_scale_abs_mean": 0.07374473214149475, "signal/advantage_pre_scale_std": 0.16322429180145265, "signal/advantage_std": 0.16322429180145265, "signal/brier_reward/centered_abs_mean": 0.06756164729595185, "signal/brier_reward/group_std_mean": 0.09020474702119827, "signal/brier_reward/group_zero_std_frac": 0.21666666567325593, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.033780823647975924, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.033780823647975924, "signal/confidence_one_or_zero/centered_abs_mean": 0.007921006996184587, "signal/confidence_one_or_zero/group_std_mean": 0.012272831983864307, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9555555582046509, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 7.921006641709028e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 7.921006641709028e-08, "signal/format_reward/centered_abs_mean": 0.006412760389503091, "signal/format_reward/group_std_mean": 0.011931251361966133, "signal/format_reward/group_zero_std_frac": 0.95, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0032063801947515456, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0032063801947515456, "signal/mean_confidence_reward/centered_abs_mean": 0.05102816671133041, "signal/mean_confidence_reward/group_std_mean": 0.06970654726028443, "signal/mean_confidence_reward/group_zero_std_frac": 0.23888889253139495, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.102816317048564e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.102816317048564e-07, "step": 1000 }, { "epoch": 2.4031949600629994, "eval_calibration/aurc": 0.11307764964594312, "eval_calibration/batch_distribution_entropy": 0.45459900138155657, "eval_calibration/confidence_entropy": 0.40907026477251535, "eval_calibration/coverage@0%": 0.11979166666666667, "eval_calibration/coverage@1%": 0.11979166666666667, "eval_calibration/coverage@10%": 0.6145833333333334, "eval_calibration/coverage@15%": 0.703125, "eval_calibration/coverage@20%": 0.8802083333333334, "eval_calibration/coverage@25%": 0.9375, "eval_calibration/coverage@30%": 0.9791666666666666, "eval_calibration/coverage@5%": 0.3958333333333333, "eval_calibration/ece": 0.11718750000000006, "eval_calibration/mean_confidence": 0.7713541666666668, "eval_completions/clipped_ratio": 0.0026041666666666665, "eval_completions/max_length": 3572.0, "eval_completions/max_terminated_length": 3572.0, "eval_completions/mean_length": 1106.7180989583333, "eval_completions/mean_terminated_length": 1109.6278279622395, "eval_completions/min_length": 181.5, "eval_completions/min_terminated_length": 279.5, "eval_loss": 0.0, "eval_num_tokens": 2706428183.0, "eval_reward": 1.2929895122845967, "eval_reward_std": 0.3166048725446065, "eval_rewards/accuracy_reward": 0.7317708233992258, "eval_rewards/brier_reward": 0.8567968904972076, "eval_rewards/confidence_one_or_zero": 0.004340277907128136, "eval_rewards/format_reward": 0.9973958333333334, "eval_rewards/mean_confidence_reward": 0.7687499721844991, "eval_runtime": 201.7277, "eval_samples_per_second": 4.957, "eval_signal/accuracy_reward/centered_abs_mean": 0.3802625884612401, "eval_signal/accuracy_reward/group_std_mean": 0.44101160764694214, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.19013129423062006, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.19013129423062006, "eval_signal/advantage_abs_mean": 0.26487501462300617, "eval_signal/advantage_pre_scale_abs_mean": 0.26487501462300617, "eval_signal/advantage_pre_scale_std": 0.31484073400497437, "eval_signal/advantage_std": 0.31484073400497437, "eval_signal/brier_reward/centered_abs_mean": 0.17813993245363235, "eval_signal/brier_reward/group_std_mean": 0.23801814019680023, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08906996622681618, "eval_signal/brier_reward/weight": 0.5, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.08906996622681618, "eval_signal/confidence_one_or_zero/centered_abs_mean": 0.008409287935743729, "eval_signal/confidence_one_or_zero/group_std_mean": 0.02455231888840596, "eval_signal/confidence_one_or_zero/group_zero_std_frac": 0.8611111342906952, "eval_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 8.409287488575501e-08, "eval_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "eval_signal/confidence_one_or_zero/weighted_centered_abs_mean": 8.409287488575501e-08, "eval_signal/format_reward/centered_abs_mean": 0.0050455727614462376, "eval_signal/format_reward/group_std_mean": 0.014731391333043575, "eval_signal/format_reward/group_zero_std_frac": 0.9166666766007742, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0025227863807231188, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.0025227863807231188, "eval_signal/mean_confidence_reward/centered_abs_mean": 0.1713812674085299, "eval_signal/mean_confidence_reward/group_std_mean": 0.21960687388976416, "eval_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "eval_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.7138126319575047e-06, "eval_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "eval_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.7138126319575047e-06, "eval_steps_per_second": 0.03, "step": 1000 }, { "epoch": 2.4031949600629994, "step": 1000, "train_probe_calibration/aurc": 0.11347730893319892, "train_probe_calibration/batch_distribution_entropy": 0.4812146322955091, "train_probe_calibration/confidence_entropy": 0.4153594980424879, "train_probe_calibration/coverage@0%": 0.02638888888888889, "train_probe_calibration/coverage@1%": 0.02638888888888889, "train_probe_calibration/coverage@10%": 0.6065972222222222, "train_probe_calibration/coverage@15%": 0.8364583333333333, "train_probe_calibration/coverage@20%": 0.9461805555555555, "train_probe_calibration/coverage@25%": 0.9722222222222222, "train_probe_calibration/coverage@30%": 1.0, "train_probe_calibration/coverage@5%": 0.12534722222222222, "train_probe_calibration/ece": 0.10368055555555566, "train_probe_calibration/mean_confidence": 0.7641319444444444, "train_probe_completions/clipped_ratio": 0.012673611111111113, "train_probe_completions/max_length": 3793.8333333333335, "train_probe_completions/max_terminated_length": 3793.8333333333335, "train_probe_completions/mean_length": 1091.1081949869792, "train_probe_completions/mean_terminated_length": 1105.3033854166667, "train_probe_completions/min_length": 87.33333333333333, "train_probe_completions/min_terminated_length": 257.8333333333333, "train_probe_loss": 0.0, "train_probe_num_tokens": 2706428183.0, "train_probe_reward": 1.3265713055928547, "train_probe_reward_std": 0.2984622319539388, "train_probe_rewards/accuracy_reward": 0.7855902711550394, "train_probe_rewards/brier_reward": 0.8744812607765198, "train_probe_rewards/confidence_one_or_zero": 0.009548611240461469, "train_probe_rewards/format_reward": 0.9930555522441864, "train_probe_rewards/mean_confidence_reward": 0.7690451343854269, "train_probe_runtime": 210.6752, "train_probe_samples_per_second": 4.747, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3287217915058136, "train_probe_signal/accuracy_reward/group_std_mean": 0.40934642652670544, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.1643608957529068, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.1643608957529068, "train_probe_signal/advantage_abs_mean": 0.2322815457979838, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2322815457979838, "train_probe_signal/advantage_pre_scale_std": 0.2985395093758901, "train_probe_signal/advantage_std": 0.2985395093758901, "train_probe_signal/brier_reward/centered_abs_mean": 0.15897609541813532, "train_probe_signal/brier_reward/group_std_mean": 0.22392593075831732, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07948804770906766, "train_probe_signal/brier_reward/weight": 0.5, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.07948804770906766, "train_probe_signal/confidence_one_or_zero/centered_abs_mean": 0.018283420087148745, "train_probe_signal/confidence_one_or_zero/group_std_mean": 0.048036283192535244, "train_probe_signal/confidence_one_or_zero/group_zero_std_frac": 0.750000019868215, "train_probe_signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.828341898620541e-07, "train_probe_signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "train_probe_signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.828341898620541e-07, "train_probe_signal/format_reward/centered_abs_mean": 0.013346354011446238, "train_probe_signal/format_reward/group_std_mean": 0.0362943010404706, "train_probe_signal/format_reward/group_zero_std_frac": 0.8055555721124014, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.006673177005723119, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.006673177005723119, "train_probe_signal/mean_confidence_reward/centered_abs_mean": 0.17747067660093307, "train_probe_signal/mean_confidence_reward/group_std_mean": 0.22761460890372595, "train_probe_signal/mean_confidence_reward/group_zero_std_frac": 0.0, "train_probe_signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 1.7747067128463339e-06, "train_probe_signal/mean_confidence_reward/weight": 9.999999747378752e-06, "train_probe_signal/mean_confidence_reward/weighted_centered_abs_mean": 1.7747067128463339e-06, "train_probe_steps_per_second": 0.028 }, { "calibration/aurc": 0.12772337485405671, "calibration/batch_distribution_entropy": 0.5078687370093968, "calibration/confidence_entropy": 0.42604049788968457, "calibration/coverage@0%": 0.16315909387287683, "calibration/coverage@1%": 0.2642172949310779, "calibration/coverage@10%": 0.42880218811266924, "calibration/coverage@15%": 0.4816116308100836, "calibration/coverage@20%": 0.6695970555879612, "calibration/coverage@25%": 0.8960411386049476, "calibration/coverage@30%": 0.9340783331948254, "calibration/coverage@5%": 0.41466528335076447, "calibration/ece": 0.11922840452154111, "calibration/mean_confidence": 0.7539156265255479, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008940972222222232, "completions/max_length": 4017.4, "completions/max_terminated_length": 4017.4, "completions/mean_length": 1184.4095703125, "completions/mean_terminated_length": 1195.205322265625, "completions/min_length": 0.0, "completions/min_terminated_length": 236.4, "epoch": 2.4151948100648744, "grad_norm": 0.000479711132356897, "learning_rate": 2.103365384615385e-07, "loss": -0.0119, "num_tokens": 2723197797.0, "reward": 1.3158220767974853, "reward_std": 0.11039352715015412, "rewards/accuracy_reward": 0.759375, "rewards/brier_reward": 0.8810216665267945, "rewards/confidence_one_or_zero": 0.0038194443855900317, "rewards/format_reward": 0.9912326455116272, "rewards/mean_confidence_reward": 0.7422474026679993, "signal/accuracy_reward/centered_abs_mean": 0.09804687649011612, "signal/accuracy_reward/group_std_mean": 0.13862352967262268, "signal/accuracy_reward/group_zero_std_frac": 0.575000011920929, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04902343824505806, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04902343824505806, "signal/advantage_abs_mean": 0.07680359631776809, "signal/advantage_pre_scale_abs_mean": 0.07680359631776809, "signal/advantage_pre_scale_std": 0.17023244500160217, "signal/advantage_std": 0.17023244500160217, "signal/brier_reward/centered_abs_mean": 0.07126273363828659, "signal/brier_reward/group_std_mean": 0.09792182445526124, "signal/brier_reward/group_zero_std_frac": 0.1833333343267441, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03563136681914329, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03563136681914329, "signal/confidence_one_or_zero/centered_abs_mean": 0.0049696180853061375, "signal/confidence_one_or_zero/group_std_mean": 0.00806911545805633, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9694444417953492, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.9696179260649843e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.9696179260649843e-08, "signal/format_reward/centered_abs_mean": 0.014360894076526165, "signal/format_reward/group_std_mean": 0.025504958257079123, "signal/format_reward/group_zero_std_frac": 0.9000000119209289, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.007180447038263082, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007180447038263082, "signal/mean_confidence_reward/centered_abs_mean": 0.06406570747494697, "signal/mean_confidence_reward/group_std_mean": 0.08374290466308594, "signal/mean_confidence_reward/group_zero_std_frac": 0.1916666656732559, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 6.406570719263982e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 6.406570719263982e-07, "step": 1005 }, { "calibration/aurc": 0.074684769449917, "calibration/batch_distribution_entropy": 0.5397490149217888, "calibration/confidence_entropy": 0.4277009252155712, "calibration/coverage@0%": 0.046453587287374865, "calibration/coverage@1%": 0.15999525395404152, "calibration/coverage@10%": 0.7299631351983742, "calibration/coverage@15%": 0.8410615208877286, "calibration/coverage@20%": 0.9182291666666668, "calibration/coverage@25%": 0.9838541666666668, "calibration/coverage@30%": 0.9953125, "calibration/coverage@5%": 0.4770647934238286, "calibration/ece": 0.11489781996580213, "calibration/mean_confidence": 0.7607692614993008, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006684027777777768, "completions/max_length": 4002.0, "completions/max_terminated_length": 4002.0, "completions/mean_length": 1133.229345703125, "completions/mean_terminated_length": 1140.8066162109376, "completions/min_length": 0.0, "completions/min_terminated_length": 274.6, "epoch": 2.4271946600667493, "grad_norm": 0.0004865651426371187, "learning_rate": 1.8028846153846153e-07, "loss": -0.0084, "num_tokens": 2739376375.0, "reward": 1.2982456922531127, "reward_std": 0.1104705885052681, "rewards/accuracy_reward": 0.7449652671813964, "rewards/brier_reward": 0.8581950902938843, "rewards/confidence_one_or_zero": 0.005642361263744533, "rewards/format_reward": 0.9933159708976745, "rewards/mean_confidence_reward": 0.7525590300559998, "signal/accuracy_reward/centered_abs_mean": 0.1011935755610466, "signal/accuracy_reward/group_std_mean": 0.14138245284557344, "signal/accuracy_reward/group_zero_std_frac": 0.569444453716278, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0505967877805233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0505967877805233, "signal/advantage_abs_mean": 0.0779399573802948, "signal/advantage_pre_scale_abs_mean": 0.0779399573802948, "signal/advantage_pre_scale_std": 0.16687268614768982, "signal/advantage_std": 0.16687268614768982, "signal/brier_reward/centered_abs_mean": 0.07169617861509323, "signal/brier_reward/group_std_mean": 0.09820040464401245, "signal/brier_reward/group_zero_std_frac": 0.17500000298023224, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03584808930754661, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03584808930754661, "signal/confidence_one_or_zero/centered_abs_mean": 0.006754557369276881, "signal/confidence_one_or_zero/group_std_mean": 0.009212856367230415, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9722222208976745, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.754557162480523e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.754557162480523e-08, "signal/format_reward/centered_abs_mean": 0.011073133535683155, "signal/format_reward/group_std_mean": 0.019407318718731404, "signal/format_reward/group_zero_std_frac": 0.925000011920929, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005536566767841577, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005536566767841577, "signal/mean_confidence_reward/centered_abs_mean": 0.05948562324047089, "signal/mean_confidence_reward/group_std_mean": 0.08126413226127624, "signal/mean_confidence_reward/group_zero_std_frac": 0.18611111044883727, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.948562261437474e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.948562261437474e-07, "step": 1010 }, { "calibration/aurc": 0.06345862889847641, "calibration/batch_distribution_entropy": 0.42225252492058923, "calibration/confidence_entropy": 0.4098343730018147, "calibration/coverage@0%": 0.21580285204991084, "calibration/coverage@1%": 0.5267403520499109, "calibration/coverage@10%": 0.7755886809269162, "calibration/coverage@15%": 0.8087386809269163, "calibration/coverage@20%": 0.8397547237076649, "calibration/coverage@25%": 0.9456, "calibration/coverage@30%": 0.9536, "calibration/coverage@5%": 0.7656928475935828, "calibration/ece": 0.1519316891711228, "calibration/mean_confidence": 0.7710957172459894, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005121527777777768, "completions/max_length": 4021.6, "completions/max_terminated_length": 4021.6, "completions/mean_length": 1076.5626953125, "completions/mean_terminated_length": 1082.214013671875, "completions/min_length": 0.0, "completions/min_terminated_length": 251.8, "epoch": 2.4391945100686243, "grad_norm": 0.0005801943480037153, "learning_rate": 1.5024038461538462e-07, "loss": -0.005, "num_tokens": 2754857737.0, "reward": 1.3297703266143799, "reward_std": 0.10483562052249909, "rewards/accuracy_reward": 0.7934027791023255, "rewards/brier_reward": 0.8712435960769653, "rewards/confidence_one_or_zero": 0.0052083334478084, "rewards/format_reward": 0.9948784708976746, "rewards/mean_confidence_reward": 0.7831310749053955, "signal/accuracy_reward/centered_abs_mean": 0.0933810755610466, "signal/accuracy_reward/group_std_mean": 0.12864405959844588, "signal/accuracy_reward/group_zero_std_frac": 0.6138888835906983, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0466905377805233, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0466905377805233, "signal/advantage_abs_mean": 0.0742639109492302, "signal/advantage_pre_scale_abs_mean": 0.0742639109492302, "signal/advantage_pre_scale_std": 0.16281065642833709, "signal/advantage_std": 0.16281065642833709, "signal/brier_reward/centered_abs_mean": 0.06942715719342232, "signal/brier_reward/group_std_mean": 0.09444344490766525, "signal/brier_reward/group_zero_std_frac": 0.2027777761220932, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03471357859671116, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03471357859671116, "signal/confidence_one_or_zero/centered_abs_mean": 0.00756293412996456, "signal/confidence_one_or_zero/group_std_mean": 0.01193662453442812, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9583333492279053, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 7.56293410120179e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 7.56293410120179e-08, "signal/format_reward/centered_abs_mean": 0.008490668633021415, "signal/format_reward/group_std_mean": 0.015459578996524215, "signal/format_reward/group_zero_std_frac": 0.9361111283302307, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004245334316510707, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004245334316510707, "signal/mean_confidence_reward/centered_abs_mean": 0.05429552644491196, "signal/mean_confidence_reward/group_std_mean": 0.0731324501335621, "signal/mean_confidence_reward/group_zero_std_frac": 0.24166666865348815, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.429552402347326e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.429552402347326e-07, "step": 1015 }, { "calibration/aurc": 0.14245162522594357, "calibration/batch_distribution_entropy": 0.4354837970914881, "calibration/confidence_entropy": 0.40429451714756154, "calibration/coverage@0%": 0.03184318527579614, "calibration/coverage@1%": 0.03184318527579614, "calibration/coverage@10%": 0.2078916513332374, "calibration/coverage@15%": 0.38749619876578745, "calibration/coverage@20%": 0.8565399431549515, "calibration/coverage@25%": 0.9309465223097113, "calibration/coverage@30%": 0.9367208005249343, "calibration/coverage@5%": 0.1844473519424628, "calibration/ece": 0.07660766540829758, "calibration/mean_confidence": 0.792147074034765, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 4052.2, "completions/max_terminated_length": 4052.2, "completions/mean_length": 1083.0081787109375, "completions/mean_terminated_length": 1089.6731689453125, "completions/min_length": 0.0, "completions/min_terminated_length": 234.2, "epoch": 2.4511943600704993, "grad_norm": 0.0005132149672135711, "learning_rate": 1.201923076923077e-07, "loss": -0.01, "num_tokens": 2770426023.0, "reward": 1.3075095176696778, "reward_std": 0.09387443959712982, "rewards/accuracy_reward": 0.7511284828186036, "rewards/brier_reward": 0.8701250433921814, "rewards/confidence_one_or_zero": 0.009548611112404615, "rewards/format_reward": 0.99375, "rewards/mean_confidence_reward": 0.7681614279747009, "signal/accuracy_reward/centered_abs_mean": 0.0928005650639534, "signal/accuracy_reward/group_std_mean": 0.1202504426240921, "signal/accuracy_reward/group_zero_std_frac": 0.6638888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.0464002825319767, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0464002825319767, "signal/advantage_abs_mean": 0.06897872760891914, "signal/advantage_pre_scale_abs_mean": 0.06897872760891914, "signal/advantage_pre_scale_std": 0.1561448872089386, "signal/advantage_std": 0.1561448872089386, "signal/brier_reward/centered_abs_mean": 0.06328051909804344, "signal/brier_reward/group_std_mean": 0.08385396748781204, "signal/brier_reward/group_zero_std_frac": 0.2694444477558136, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03164025954902172, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03164025954902172, "signal/confidence_one_or_zero/centered_abs_mean": 0.010633680620230734, "signal/confidence_one_or_zero/group_std_mean": 0.015721834963187575, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9472222328186035, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 1.063368006981591e-07, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 1.063368006981591e-07, "signal/format_reward/centered_abs_mean": 0.01015625, "signal/format_reward/group_std_mean": 0.01880386732518673, "signal/format_reward/group_zero_std_frac": 0.925000011920929, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.005078125, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005078125, "signal/mean_confidence_reward/centered_abs_mean": 0.051573795825243, "signal/mean_confidence_reward/group_std_mean": 0.06898580715060235, "signal/mean_confidence_reward/group_zero_std_frac": 0.2805555582046509, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.157379632692027e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.157379632692027e-07, "step": 1020 }, { "calibration/aurc": 0.08085045625612934, "calibration/batch_distribution_entropy": 0.49563165742965465, "calibration/confidence_entropy": 0.424957605240448, "calibration/coverage@0%": 0.15056444911122352, "calibration/coverage@1%": 0.15056444911122352, "calibration/coverage@10%": 0.6355874274921511, "calibration/coverage@15%": 0.797734257879604, "calibration/coverage@20%": 0.8670759899912968, "calibration/coverage@25%": 0.9885117493472585, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.5656227786283543, "calibration/ece": 0.11574308306164238, "calibration/mean_confidence": 0.7491831816392811, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006510416666666652, "completions/max_length": 4007.6, "completions/max_terminated_length": 4007.6, "completions/mean_length": 1102.3575927734375, "completions/mean_terminated_length": 1109.7432373046875, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 2.4631942100723743, "grad_norm": 0.0005867654108442366, "learning_rate": 9.014423076923076e-08, "loss": -0.0065, "num_tokens": 2786231742.0, "reward": 1.3031750202178956, "reward_std": 0.09940682649612427, "rewards/accuracy_reward": 0.7552951335906982, "rewards/brier_reward": 0.857550048828125, "rewards/confidence_one_or_zero": 0.0029513889108784496, "rewards/format_reward": 0.9934895753860473, "rewards/mean_confidence_reward": 0.7570624828338623, "signal/accuracy_reward/centered_abs_mean": 0.09311523288488388, "signal/accuracy_reward/group_std_mean": 0.12970585078001023, "signal/accuracy_reward/group_zero_std_frac": 0.6027777910232544, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04655761644244194, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04655761644244194, "signal/advantage_abs_mean": 0.07214210256934166, "signal/advantage_pre_scale_abs_mean": 0.07214210256934166, "signal/advantage_pre_scale_std": 0.1583523139357567, "signal/advantage_std": 0.1583523139357567, "signal/brier_reward/centered_abs_mean": 0.06650192737579345, "signal/brier_reward/group_std_mean": 0.08938954323530197, "signal/brier_reward/group_zero_std_frac": 0.21944444477558137, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03325096368789673, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03325096368789673, "signal/confidence_one_or_zero/centered_abs_mean": 0.004513888969086111, "signal/confidence_one_or_zero/group_std_mean": 0.008359964191913604, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9638888835906982, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 4.5138886406448364e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 4.5138886406448364e-08, "signal/format_reward/centered_abs_mean": 0.008913845452480019, "signal/format_reward/group_std_mean": 0.01367659866809845, "signal/format_reward/group_zero_std_frac": 0.95, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004456922726240009, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004456922726240009, "signal/mean_confidence_reward/centered_abs_mean": 0.05395682230591774, "signal/mean_confidence_reward/group_std_mean": 0.07205559611320496, "signal/mean_confidence_reward/group_zero_std_frac": 0.2388888865709305, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.395682137532276e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.395682137532276e-07, "step": 1025 }, { "calibration/aurc": 0.08105710733255718, "calibration/batch_distribution_entropy": 0.5064012145346954, "calibration/confidence_entropy": 0.4203946947410608, "calibration/coverage@0%": 0.029434440396734228, "calibration/coverage@1%": 0.14245527373006756, "calibration/coverage@10%": 0.7455349332807952, "calibration/coverage@15%": 0.7975526512080114, "calibration/coverage@20%": 0.834679723031216, "calibration/coverage@25%": 0.9446665578764144, "calibration/coverage@30%": 0.9592689295039165, "calibration/coverage@5%": 0.571370232051285, "calibration/ece": 0.11286531232371791, "calibration/mean_confidence": 0.7533351790956763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006857638888888906, "completions/max_length": 4064.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 1143.324658203125, "completions/mean_terminated_length": 1151.3432373046876, "completions/min_length": 0.0, "completions/min_terminated_length": 260.4, "epoch": 2.4751940600742492, "grad_norm": 0.00044234259985387325, "learning_rate": 6.009615384615386e-08, "loss": -0.0096, "num_tokens": 2802498938.0, "reward": 1.2955131769180297, "reward_std": 0.09872912913560868, "rewards/accuracy_reward": 0.7381076335906982, "rewards/brier_reward": 0.8597612023353577, "rewards/confidence_one_or_zero": 0.0031250000814907254, "rewards/format_reward": 0.9931423664093018, "rewards/mean_confidence_reward": 0.7495512366294861, "signal/accuracy_reward/centered_abs_mean": 0.08402235209941863, "signal/accuracy_reward/group_std_mean": 0.11890482306480407, "signal/accuracy_reward/group_zero_std_frac": 0.625, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04201117604970932, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04201117604970932, "signal/advantage_abs_mean": 0.0684260293841362, "signal/advantage_pre_scale_abs_mean": 0.0684260293841362, "signal/advantage_pre_scale_std": 0.15545083284378053, "signal/advantage_std": 0.15545083284378053, "signal/brier_reward/centered_abs_mean": 0.06586636751890182, "signal/brier_reward/group_std_mean": 0.09170816987752914, "signal/brier_reward/group_zero_std_frac": 0.20277778208255767, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03293318375945091, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03293318375945091, "signal/confidence_one_or_zero/centered_abs_mean": 0.0054144964320585135, "signal/confidence_one_or_zero/group_std_mean": 0.01076931986026466, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9527777671813965, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 5.41449651336734e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 5.41449651336734e-08, "signal/format_reward/centered_abs_mean": 0.009933810774236917, "signal/format_reward/group_std_mean": 0.01772972457110882, "signal/format_reward/group_zero_std_frac": 0.9305555820465088, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.004966905387118459, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.004966905387118459, "signal/mean_confidence_reward/centered_abs_mean": 0.05369309484958649, "signal/mean_confidence_reward/group_std_mean": 0.07315653115510941, "signal/mean_confidence_reward/group_zero_std_frac": 0.2194444417953491, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.369309292291291e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.369309292291291e-07, "step": 1030 }, { "calibration/aurc": 0.07310658250536434, "calibration/batch_distribution_entropy": 0.5767640449727093, "calibration/confidence_entropy": 0.43607185872289167, "calibration/coverage@0%": 0.025680774278215225, "calibration/coverage@1%": 0.15536827427821523, "calibration/coverage@10%": 0.7524757396248877, "calibration/coverage@15%": 0.819786793270536, "calibration/coverage@20%": 0.8809467181158059, "calibration/coverage@25%": 0.9068077427821523, "calibration/coverage@30%": 0.961708497375328, "calibration/coverage@5%": 0.6806316661097072, "calibration/ece": 0.1281238583416876, "calibration/mean_confidence": 0.710103208848521, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00347222222222221, "completions/max_length": 4027.4, "completions/max_terminated_length": 4027.4, "completions/mean_length": 1141.57646484375, "completions/mean_terminated_length": 1145.533447265625, "completions/min_length": 0.0, "completions/min_terminated_length": 239.4, "epoch": 2.4871939100761242, "grad_norm": 0.000561815220862627, "learning_rate": 3.004807692307693e-08, "loss": -0.0018, "num_tokens": 2818720907.0, "reward": 1.3272055387496948, "reward_std": 0.09913517981767654, "rewards/accuracy_reward": 0.7771701335906982, "rewards/brier_reward": 0.8806982159614563, "rewards/confidence_one_or_zero": 0.004079861147329211, "rewards/format_reward": 0.9965277791023255, "rewards/mean_confidence_reward": 0.7443830847740174, "signal/accuracy_reward/centered_abs_mean": 0.09791124165058136, "signal/accuracy_reward/group_std_mean": 0.13428298532962799, "signal/accuracy_reward/group_zero_std_frac": 0.5972222328186035, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04895562082529068, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04895562082529068, "signal/advantage_abs_mean": 0.06943440213799476, "signal/advantage_pre_scale_abs_mean": 0.06943440213799476, "signal/advantage_pre_scale_std": 0.1531498283147812, "signal/advantage_std": 0.1531498283147812, "signal/brier_reward/centered_abs_mean": 0.0640648953616619, "signal/brier_reward/group_std_mean": 0.08967790007591248, "signal/brier_reward/group_zero_std_frac": 0.16111111044883727, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03203244768083095, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03203244768083095, "signal/confidence_one_or_zero/centered_abs_mean": 0.0061794706154614685, "signal/confidence_one_or_zero/group_std_mean": 0.010753947403281927, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9583333373069763, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 6.179470339873205e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 6.179470339873205e-08, "signal/format_reward/centered_abs_mean": 0.006293402798473835, "signal/format_reward/group_std_mean": 0.012858380377292634, "signal/format_reward/group_zero_std_frac": 0.9444444656372071, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0031467013992369174, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0031467013992369174, "signal/mean_confidence_reward/centered_abs_mean": 0.052287458628416064, "signal/mean_confidence_reward/group_std_mean": 0.0705066367983818, "signal/mean_confidence_reward/group_zero_std_frac": 0.18611111342906952, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.228745692420488e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.228745692420488e-07, "step": 1035 }, { "calibration/aurc": 0.08820866690711632, "calibration/batch_distribution_entropy": 0.5409087978174177, "calibration/confidence_entropy": 0.43112268296746803, "calibration/coverage@0%": 0.17187910104986875, "calibration/coverage@1%": 0.2651082677165354, "calibration/coverage@10%": 0.6223999343832022, "calibration/coverage@15%": 0.7838336614173228, "calibration/coverage@20%": 0.8637344160104987, "calibration/coverage@25%": 0.943626968503937, "calibration/coverage@30%": 0.9786458333333332, "calibration/coverage@5%": 0.41823326771653546, "calibration/ece": 0.10233103674540675, "calibration/mean_confidence": 0.7693274278215225, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008072916666666674, "completions/max_length": 4021.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 1139.0602294921875, "completions/mean_terminated_length": 1148.7132568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 261.4, "epoch": 2.499193760077999, "grad_norm": 0.0004894222947768867, "learning_rate": 0.0, "loss": -0.0099, "num_tokens": 2834917089.0, "reward": 1.3173441171646119, "reward_std": 0.10435323715209961, "rewards/accuracy_reward": 0.7634548544883728, "rewards/brier_reward": 0.879204511642456, "rewards/confidence_one_or_zero": 0.004166666709352285, "rewards/format_reward": 0.9920138955116272, "rewards/mean_confidence_reward": 0.7441588521003724, "signal/accuracy_reward/centered_abs_mean": 0.09453667551279069, "signal/accuracy_reward/group_std_mean": 0.13257256895303726, "signal/accuracy_reward/group_zero_std_frac": 0.5833333492279053, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.04726833775639534, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.04726833775639534, "signal/advantage_abs_mean": 0.07265690118074417, "signal/advantage_pre_scale_abs_mean": 0.07265690118074417, "signal/advantage_pre_scale_std": 0.1631344735622406, "signal/advantage_std": 0.1631344735622406, "signal/brier_reward/centered_abs_mean": 0.06591500863432884, "signal/brier_reward/group_std_mean": 0.09113981872797013, "signal/brier_reward/group_zero_std_frac": 0.18055555522441863, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.03295750431716442, "signal/brier_reward/weight": 0.5, "signal/brier_reward/weighted_centered_abs_mean": 0.03295750431716442, "signal/confidence_one_or_zero/centered_abs_mean": 0.005946180666796863, "signal/confidence_one_or_zero/group_std_mean": 0.010098458267748356, "signal/confidence_one_or_zero/group_zero_std_frac": 0.9611111164093018, "signal/confidence_one_or_zero/scaled_weighted_centered_abs_mean": 5.9461805790306245e-08, "signal/confidence_one_or_zero/weight": 9.999999747378752e-06, "signal/confidence_one_or_zero/weighted_centered_abs_mean": 5.9461805790306245e-08, "signal/format_reward/centered_abs_mean": 0.013107638619840146, "signal/format_reward/group_std_mean": 0.022080105543136597, "signal/format_reward/group_zero_std_frac": 0.919444465637207, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.006553819309920073, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006553819309920073, "signal/mean_confidence_reward/centered_abs_mean": 0.055751146376132966, "signal/mean_confidence_reward/group_std_mean": 0.07500464767217636, "signal/mean_confidence_reward/group_zero_std_frac": 0.20277777910232545, "signal/mean_confidence_reward/scaled_weighted_centered_abs_mean": 5.575114357725397e-07, "signal/mean_confidence_reward/weight": 9.999999747378752e-06, "signal/mean_confidence_reward/weighted_centered_abs_mean": 5.575114357725397e-07, "step": 1040 }, { "epoch": 2.499193760077999, "step": 1040, "total_flos": 0.0, "train_loss": -0.007999900917974838, "train_runtime": 243209.3794, "train_samples_per_second": 0.308, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 1040, "num_input_tokens_seen": 2834917089, "num_train_epochs": 3, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }