{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.49919376007799904, "eval_steps": 50, "global_step": 208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "calibration/aurc": 0.5174302011976861, "calibration/batch_distribution_entropy": 0.27968643045408775, "calibration/batch_entropy_100bins": 0.3491582241290684, "calibration/batch_entropy_10bins": 0.27968643045408775, "calibration/batch_entropy_50bins": 0.408646715708448, "calibration/batch_uniqueness": 0.5034224970546688, "calibration/confidence_entropy": 0.222429721256218, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/distribution_entropy_10": 0.27968643045408775, "calibration/distribution_entropy_100": 0.3491582241290684, "calibration/ece": 0.47095527672812854, "calibration/mean_confidence": 0.9153333731605595, "calibration/unique_confidence_per_question": 0.03072916666666667, "calibration/unique_confidences": 11.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.019704861111111138, "completions/max_length": 4029.6, "completions/max_terminated_length": 4029.6, "completions/mean_length": 513.3373291015625, "completions/mean_terminated_length": 523.6432006835937, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.011999850001874977, "grad_norm": 0.009435656480491161, "learning_rate": 5.952380952380953e-07, "loss": 0.0081, "num_tokens": 9027854.0, "reward": 0.434605211019516, "reward_std": 0.3864780366420746, "rewards/accuracy_reward": 0.26076388657093047, "rewards/batch_coverage_0": 0.0075095250271260735, "rewards/brier_reward": 0.3133890748023987, "rewards/confidence_uniqueness_reward": 0.2916687369346619, "rewards/format_reward": 0.6003472089767456, "rewards/frontier_entropy_batch_reward": -0.5720709562301636, "signal/accuracy_reward/centered_abs_mean": 0.3107530415058136, "signal/accuracy_reward/group_std_mean": 0.36877918243408203, "signal/accuracy_reward/group_zero_std_frac": 0.09722222313284874, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.4251779973506927, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1553765207529068, "signal/advantage_abs_mean": 0.8601556777954101, "signal/advantage_pre_scale_abs_mean": 0.334391725063324, "signal/advantage_pre_scale_std": 0.3892968833446503, "signal/advantage_std": 0.9841951966285706, "signal/batch_coverage_0/centered_abs_mean": 0.01619186196476221, "signal/batch_coverage_0/group_std_mean": 0.03339236527681351, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.004439523350447417, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.001619186159223318, "signal/brier_reward/centered_abs_mean": 0.32154104113578796, "signal/brier_reward/group_std_mean": 0.37417513132095337, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08799342513084411, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.032154104113578795, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.2379334509372711, "signal/confidence_uniqueness_reward/group_std_mean": 0.28854405879974365, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06512594521045685, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.023793345689773558, "signal/format_reward/centered_abs_mean": 0.4406900942325592, "signal/format_reward/group_std_mean": 0.47528126239776614, "signal/format_reward/group_zero_std_frac": 0.0, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.603149700164795, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.2203450471162796, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.45061137676239016, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4829135715961456, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.123350390791893, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.04506114050745964, "step": 5 }, { "calibration/aurc": 0.538458366076803, "calibration/batch_distribution_entropy": 0.27025803806008913, "calibration/batch_entropy_100bins": 0.35320084312111905, "calibration/batch_entropy_10bins": 0.27025803806008913, "calibration/batch_entropy_50bins": 0.40764678867964327, "calibration/batch_uniqueness": 0.5098736001992761, "calibration/confidence_entropy": 0.2274931285048082, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/distribution_entropy_10": 0.27025803806008913, "calibration/distribution_entropy_100": 0.35320084312111905, "calibration/ece": 0.469860126629539, "calibration/mean_confidence": 0.9163531404806801, "calibration/unique_confidence_per_question": 0.034895833333333334, "calibration/unique_confidences": 13.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017013888888888884, "completions/max_length": 4031.8, "completions/max_terminated_length": 4031.8, "completions/mean_length": 475.27578125, "completions/mean_terminated_length": 483.72108154296876, "completions/min_length": 0.0, "completions/min_terminated_length": 23.4, "epoch": 0.023999700003749954, "grad_norm": 0.007094210479408503, "learning_rate": 1.1904761904761906e-06, "loss": -0.0012, "num_tokens": 17585751.0, "reward": 0.5200118303298951, "reward_std": 0.35560383200645446, "rewards/accuracy_reward": 0.29826388955116273, "rewards/batch_coverage_0": 0.008394665271043777, "rewards/brier_reward": 0.36398414969444276, "rewards/confidence_uniqueness_reward": 0.37082897424697875, "rewards/format_reward": 0.7326388716697693, "rewards/frontier_entropy_batch_reward": -0.6976032853126526, "signal/accuracy_reward/centered_abs_mean": 0.32623698115348815, "signal/accuracy_reward/group_std_mean": 0.38321188688278196, "signal/accuracy_reward/group_zero_std_frac": 0.0777777798473835, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.5054798066616059, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.16311849057674407, "signal/advantage_abs_mean": 0.8196688890457153, "signal/advantage_pre_scale_abs_mean": 0.2959678113460541, "signal/advantage_pre_scale_std": 0.35853559374809263, "signal/advantage_std": 0.9841603994369507, "signal/batch_coverage_0/centered_abs_mean": 0.016383717581629754, "signal/batch_coverage_0/group_std_mean": 0.03455512970685959, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.005117970705032349, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.0016383717767894268, "signal/brier_reward/centered_abs_mean": 0.3206613898277283, "signal/brier_reward/group_std_mean": 0.3725072264671326, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0991722971200943, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.032066140323877335, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.21787003576755523, "signal/confidence_uniqueness_reward/group_std_mean": 0.27440804839134214, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06710911989212036, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.021787003427743912, "signal/format_reward/centered_abs_mean": 0.33837890625, "signal/format_reward/group_std_mean": 0.4072873592376709, "signal/format_reward/group_zero_std_frac": 0.008333333395421505, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.5145779550075531, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.169189453125, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3695851922035217, "signal/frontier_entropy_batch_reward/group_std_mean": 0.43336753249168397, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.00555555559694767, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.11289727687835693, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.036958518996834756, "step": 10 }, { "calibration/aurc": 0.5306261075067018, "calibration/batch_distribution_entropy": 0.32752820170291713, "calibration/batch_entropy_100bins": 0.37619434130013085, "calibration/batch_entropy_10bins": 0.32752820170291713, "calibration/batch_entropy_50bins": 0.43504399957185147, "calibration/batch_uniqueness": 0.560448811522282, "calibration/confidence_entropy": 0.26276109891710886, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.0, "calibration/coverage@25%": 0.0, "calibration/coverage@30%": 0.0, "calibration/coverage@5%": 0.0, "calibration/distribution_entropy_10": 0.32752820170291713, "calibration/distribution_entropy_100": 0.37619434130013085, "calibration/ece": 0.481128862912443, "calibration/mean_confidence": 0.902030702661782, "calibration/unique_confidence_per_question": 0.0375, "calibration/unique_confidences": 14.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009114583333333325, "completions/max_length": 3973.0, "completions/max_terminated_length": 3973.0, "completions/mean_length": 421.3002563476563, "completions/mean_terminated_length": 425.16774291992186, "completions/min_length": 0.0, "completions/min_terminated_length": 50.6, "epoch": 0.03599955000562493, "grad_norm": 0.00401267409324646, "learning_rate": 1.7857142857142859e-06, "loss": -0.0301, "num_tokens": 25541114.0, "reward": 0.6512134194374084, "reward_std": 0.2598669737577438, "rewards/accuracy_reward": 0.3289930522441864, "rewards/batch_coverage_0": 0.010935892909765243, "rewards/brier_reward": 0.4457497835159302, "rewards/confidence_uniqueness_reward": 0.5453853368759155, "rewards/format_reward": 0.9550347328186035, "rewards/frontier_entropy_batch_reward": -0.9100759267807007, "signal/accuracy_reward/centered_abs_mean": 0.32265625, "signal/accuracy_reward/group_std_mean": 0.38412655591964723, "signal/accuracy_reward/group_zero_std_frac": 0.06944444701075554, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.7317090034484863, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.161328125, "signal/advantage_abs_mean": 0.7629533171653747, "signal/advantage_pre_scale_abs_mean": 0.20725361704826356, "signal/advantage_pre_scale_std": 0.2652052193880081, "signal/advantage_std": 0.9840136289596557, "signal/batch_coverage_0/centered_abs_mean": 0.02262604646384716, "signal/batch_coverage_0/group_std_mean": 0.042140249907970426, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.010491053014993668, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.0022626047022640703, "signal/brier_reward/centered_abs_mean": 0.2939529836177826, "signal/brier_reward/group_std_mean": 0.3465470314025879, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1333274409174919, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.029395297914743424, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.1786189019680023, "signal/confidence_uniqueness_reward/group_std_mean": 0.2248939424753189, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.08192214220762253, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.01786189079284668, "signal/format_reward/centered_abs_mean": 0.08039279617369174, "signal/format_reward/group_std_mean": 0.1518175147473812, "signal/format_reward/group_zero_std_frac": 0.3999999947845936, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.1705910697579384, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.04019639808684587, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.1546709954738617, "signal/frontier_entropy_batch_reward/group_std_mean": 0.26556792855262756, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.12500000409781933, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.06849853545427323, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.015467099472880363, "step": 15 }, { "calibration/aurc": 0.431346821288402, "calibration/batch_distribution_entropy": 0.5426059445625475, "calibration/batch_entropy_100bins": 0.45494946806997305, "calibration/batch_entropy_10bins": 0.5426059445625475, "calibration/batch_entropy_50bins": 0.5300555115048834, "calibration/batch_uniqueness": 0.6894089034323752, "calibration/buffer_distribution_entropy": 0.3539083886858205, "calibration/buffer_entropy_100bins": 0.39710971496026076, "calibration/buffer_entropy_10bins": 0.3539083886858205, "calibration/buffer_entropy_50bins": 0.4590901769775059, "calibration/confidence_entropy": 0.37666589181304533, "calibration/coverage@0%": 0.0, "calibration/coverage@1%": 0.0, "calibration/coverage@10%": 0.0, "calibration/coverage@15%": 0.0, "calibration/coverage@20%": 0.013648293963254593, "calibration/coverage@25%": 0.09658792650918635, "calibration/coverage@30%": 0.14482142611754684, "calibration/coverage@5%": 0.0, "calibration/distribution_entropy_10": 0.5426059445625475, "calibration/distribution_entropy_100": 0.45494946806997305, "calibration/ece": 0.3324109675010253, "calibration/mean_confidence": 0.842016994994642, "calibration/unique_confidence_per_question": 0.04895833333333333, "calibration/unique_confidences": 18.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010069444444444442, "completions/max_length": 3539.4, "completions/max_terminated_length": 3539.4, "completions/mean_length": 462.4025146484375, "completions/mean_terminated_length": 467.1513671875, "completions/min_length": 0.0, "completions/min_terminated_length": 93.8, "epoch": 0.04799940000749991, "grad_norm": 0.003982796333730221, "learning_rate": 2.380952380952381e-06, "loss": -0.0258, "num_tokens": 33981687.0, "reward": 0.7565704345703125, "reward_std": 0.21907421946525574, "rewards/accuracy_reward": 0.45008680820465086, "rewards/batch_coverage_0": 0.04114052355289459, "rewards/brier_reward": 0.5963123440742493, "rewards/confidence_uniqueness_reward": 0.6829663753509522, "rewards/format_reward": 0.9855902791023254, "rewards/frontier_entropy_batch_reward": -0.9331006526947021, "signal/accuracy_reward/centered_abs_mean": 0.290185546875, "signal/accuracy_reward/group_std_mean": 0.3576755583286285, "signal/accuracy_reward/group_zero_std_frac": 0.08055555745959282, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9244364380836487, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.1450927734375, "signal/advantage_abs_mean": 0.7477807521820068, "signal/advantage_pre_scale_abs_mean": 0.17386467158794403, "signal/advantage_pre_scale_std": 0.22889118492603303, "signal/advantage_std": 0.9838362455368042, "signal/batch_coverage_0/centered_abs_mean": 0.04352298155426979, "signal/batch_coverage_0/group_std_mean": 0.06958689764142037, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.028244782984256745, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.004352298146113753, "signal/brier_reward/centered_abs_mean": 0.23803514540195464, "signal/brier_reward/group_std_mean": 0.2906170547008514, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15154335796833038, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.02380351461470127, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.13516120612621307, "signal/confidence_uniqueness_reward/group_std_mean": 0.16643594652414323, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.08516475707292556, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.013516120798885822, "signal/format_reward/centered_abs_mean": 0.02657335102558136, "signal/format_reward/group_std_mean": 0.05870797857642174, "signal/format_reward/group_zero_std_frac": 0.7305555701255798, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.08531446680426598, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01328667551279068, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.11649189740419388, "signal/frontier_entropy_batch_reward/group_std_mean": 0.2214953511953354, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.24166666865348815, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.07496421933174133, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.011649190448224545, "step": 20 }, { "calibration/aurc": 0.35370092548420196, "calibration/batch_distribution_entropy": 0.7403816854106197, "calibration/batch_entropy_100bins": 0.6544776213166862, "calibration/batch_entropy_10bins": 0.7403816854106197, "calibration/batch_entropy_50bins": 0.7077410795972892, "calibration/batch_uniqueness": 0.8446109592830314, "calibration/buffer_distribution_entropy": 0.4704443038922532, "calibration/buffer_entropy_100bins": 0.45799401070372914, "calibration/buffer_entropy_10bins": 0.4704443038922532, "calibration/buffer_entropy_50bins": 0.5275006311693058, "calibration/confidence_entropy": 0.5280357127233377, "calibration/coverage@0%": 0.003674540682414698, "calibration/coverage@1%": 0.003674540682414698, "calibration/coverage@10%": 0.013746021109063495, "calibration/coverage@15%": 0.05981931953838286, "calibration/coverage@20%": 0.08077959217446043, "calibration/coverage@25%": 0.21680172311625281, "calibration/coverage@30%": 0.4438294080768707, "calibration/coverage@5%": 0.003674540682414698, "calibration/distribution_entropy_10": 0.7403816854106197, "calibration/distribution_entropy_100": 0.6544776213166862, "calibration/ece": 0.16746795065959824, "calibration/mean_confidence": 0.7183618971817334, "calibration/unique_confidence_per_question": 0.1578125, "calibration/unique_confidences": 60.6, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01015625, "completions/max_length": 3957.0, "completions/max_terminated_length": 3957.0, "completions/mean_length": 515.8568664550781, "completions/mean_terminated_length": 521.1552551269531, "completions/min_length": 0.0, "completions/min_terminated_length": 92.6, "epoch": 0.05999925000937488, "grad_norm": 0.0032299933955073357, "learning_rate": 2.9761904761904763e-06, "loss": -0.021, "num_tokens": 43048806.0, "reward": 0.8601099848747253, "reward_std": 0.1965584486722946, "rewards/accuracy_reward": 0.5633680582046509, "rewards/batch_coverage_0": 0.07260396629571915, "rewards/brier_reward": 0.7115841269493103, "rewards/confidence_uniqueness_reward": 0.8290335893630981, "rewards/format_reward": 0.9865451455116272, "rewards/frontier_entropy_batch_reward": -0.7616880893707275, "signal/accuracy_reward/centered_abs_mean": 0.25604383945465087, "signal/accuracy_reward/group_std_mean": 0.3211396872997284, "signal/accuracy_reward/group_zero_std_frac": 0.1555555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8837929010391236, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.12802191972732543, "signal/advantage_abs_mean": 0.7358854413032532, "signal/advantage_pre_scale_abs_mean": 0.1512439429759979, "signal/advantage_pre_scale_std": 0.20898248255252838, "signal/advantage_std": 0.983783233165741, "signal/batch_coverage_0/centered_abs_mean": 0.0846152812242508, "signal/batch_coverage_0/group_std_mean": 0.1144055426120758, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.058388042449951175, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.00846152864396572, "signal/brier_reward/centered_abs_mean": 0.16035446226596833, "signal/brier_reward/group_std_mean": 0.20569514036178588, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.11123347133398057, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.016035445593297483, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.09056732803583145, "signal/confidence_uniqueness_reward/group_std_mean": 0.12100362330675125, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.062135671824216844, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.009056732803583146, "signal/format_reward/centered_abs_mean": 0.02431098110973835, "signal/format_reward/group_std_mean": 0.049627327173948285, "signal/format_reward/group_zero_std_frac": 0.7861111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.08382118195295334, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.012155490554869175, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3184548020362854, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4208760499954224, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.033333333767950536, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.21904338300228118, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03184547908604145, "step": 25 }, { "calibration/aurc": 0.2839966804132776, "calibration/batch_distribution_entropy": 0.9605051655041095, "calibration/batch_entropy_100bins": 0.9429200926612158, "calibration/batch_entropy_10bins": 0.9605051655041095, "calibration/batch_entropy_50bins": 0.9610896445243384, "calibration/batch_uniqueness": 0.9511843537092707, "calibration/buffer_distribution_entropy": 0.6156539225956406, "calibration/buffer_entropy_100bins": 0.601263494323698, "calibration/buffer_entropy_10bins": 0.6156539225956406, "calibration/buffer_entropy_50bins": 0.6609181402206332, "calibration/confidence_entropy": 0.5267294948163934, "calibration/coverage@0%": 0.006397462349291899, "calibration/coverage@1%": 0.006397462349291899, "calibration/coverage@10%": 0.006397462349291899, "calibration/coverage@15%": 0.023509761814532543, "calibration/coverage@20%": 0.10798261586113189, "calibration/coverage@25%": 0.34141912353301845, "calibration/coverage@30%": 0.822169028871391, "calibration/coverage@5%": 0.006397462349291899, "calibration/distribution_entropy_10": 0.9605051655041095, "calibration/distribution_entropy_100": 0.9429200926612158, "calibration/ece": 0.22083162645645044, "calibration/mean_confidence": 0.5505724597008637, "calibration/unique_confidence_per_question": 0.8807291666666666, "calibration/unique_confidences": 338.2, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01605902777777777, "completions/max_length": 3942.2, "completions/max_terminated_length": 3942.2, "completions/mean_length": 583.0853393554687, "completions/mean_terminated_length": 592.7757568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 117.6, "epoch": 0.07199910001124986, "grad_norm": 0.002717937109991908, "learning_rate": 3.5714285714285718e-06, "loss": -0.0357, "num_tokens": 52875869.0, "reward": 0.9348342657089234, "reward_std": 0.18317421972751619, "rewards/accuracy_reward": 0.5981770753860474, "rewards/batch_coverage_0": 0.10351773351430893, "rewards/brier_reward": 0.7025912404060364, "rewards/confidence_uniqueness_reward": 0.9337292194366456, "rewards/format_reward": 0.9822048664093017, "rewards/frontier_entropy_batch_reward": -0.293405282497406, "signal/accuracy_reward/centered_abs_mean": 0.23839518129825593, "signal/accuracy_reward/group_std_mean": 0.2983840018510818, "signal/accuracy_reward/group_zero_std_frac": 0.21111111491918563, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.869914174079895, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.11919759064912797, "signal/advantage_abs_mean": 0.7542917847633361, "signal/advantage_pre_scale_abs_mean": 0.14086539298295975, "signal/advantage_pre_scale_std": 0.19977498352527617, "signal/advantage_std": 0.983738362789154, "signal/batch_coverage_0/centered_abs_mean": 0.19907444715499878, "signal/batch_coverage_0/group_std_mean": 0.26103257536888125, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.14895964413881302, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.019907445646822453, "signal/brier_reward/centered_abs_mean": 0.21252276003360748, "signal/brier_reward/group_std_mean": 0.2615444421768188, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.15797891467809677, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.02125227525830269, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.04080349802970886, "signal/confidence_uniqueness_reward/group_std_mean": 0.06831279695034027, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.030214974656701088, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004080350138247013, "signal/format_reward/centered_abs_mean": 0.02994249165058136, "signal/format_reward/group_std_mean": 0.055678685754537584, "signal/format_reward/group_zero_std_frac": 0.7722222208976746, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.11192618533968926, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01497124582529068, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3531898558139801, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4236706256866455, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.2580131709575653, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03531898595392704, "step": 30 }, { "calibration/aurc": 0.22170318832234343, "calibration/batch_distribution_entropy": 0.9421379128373853, "calibration/batch_entropy_100bins": 0.939229582486355, "calibration/batch_entropy_10bins": 0.9421379128373853, "calibration/batch_entropy_50bins": 0.9489073127515452, "calibration/batch_uniqueness": 0.9435042531493842, "calibration/buffer_distribution_entropy": 0.7322581373218676, "calibration/buffer_entropy_100bins": 0.7159217758015332, "calibration/buffer_entropy_10bins": 0.7322581373218676, "calibration/buffer_entropy_50bins": 0.7623503142731999, "calibration/confidence_entropy": 0.45037526062650085, "calibration/coverage@0%": 0.004776460128240555, "calibration/coverage@1%": 0.004776460128240555, "calibration/coverage@10%": 0.019253671924487202, "calibration/coverage@15%": 0.1755596710110715, "calibration/coverage@20%": 0.5139170791128755, "calibration/coverage@25%": 0.7677293737551006, "calibration/coverage@30%": 0.908045643554782, "calibration/coverage@5%": 0.004776460128240555, "calibration/distribution_entropy_10": 0.9421379128373853, "calibration/distribution_entropy_100": 0.939229582486355, "calibration/ece": 0.1732557673693993, "calibration/mean_confidence": 0.6251879703657945, "calibration/unique_confidence_per_question": 0.9864583333333334, "calibration/unique_confidences": 378.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021788194444444464, "completions/max_length": 3869.6, "completions/max_terminated_length": 3869.6, "completions/mean_length": 676.70703125, "completions/mean_terminated_length": 691.8087524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 173.8, "epoch": 0.08399895001312484, "grad_norm": 0.00269313994795084, "learning_rate": 4.166666666666667e-06, "loss": -0.0507, "num_tokens": 63748974.0, "reward": 0.9618130803108216, "reward_std": 0.1690991222858429, "rewards/accuracy_reward": 0.6490451335906983, "rewards/batch_coverage_0": 0.1625123217701912, "rewards/brier_reward": 0.7147271633148193, "rewards/confidence_uniqueness_reward": 0.9211899161338806, "rewards/format_reward": 0.9766493082046509, "rewards/frontier_entropy_batch_reward": -0.30877079665660856, "signal/accuracy_reward/centered_abs_mean": 0.19120551347732545, "signal/accuracy_reward/group_std_mean": 0.2488324373960495, "signal/accuracy_reward/group_zero_std_frac": 0.3027777761220932, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9168439865112304, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.09560275673866273, "signal/advantage_abs_mean": 0.736086618900299, "signal/advantage_pre_scale_abs_mean": 0.12685162276029588, "signal/advantage_pre_scale_std": 0.1916327953338623, "signal/advantage_std": 0.9835097432136536, "signal/batch_coverage_0/centered_abs_mean": 0.22149142622947693, "signal/batch_coverage_0/group_std_mean": 0.2891678690910339, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21263793408870696, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.02214914225041866, "signal/brier_reward/centered_abs_mean": 0.23408417999744416, "signal/brier_reward/group_std_mean": 0.28517125248909, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.22420990467071533, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.023408417776226997, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.053552377969026566, "signal/confidence_uniqueness_reward/group_std_mean": 0.08374377638101578, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05234308801591396, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0053552379831671715, "signal/format_reward/centered_abs_mean": 0.03601888045668602, "signal/format_reward/group_std_mean": 0.06319156214594841, "signal/format_reward/group_zero_std_frac": 0.7527777791023255, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.17084376215934755, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01800944022834301, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.381756055355072, "signal/frontier_entropy_batch_reward/group_std_mean": 0.44897361397743224, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.37677479684352877, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0381756067276001, "step": 35 }, { "calibration/aurc": 0.24541412524387626, "calibration/batch_distribution_entropy": 0.9383568457569865, "calibration/batch_entropy_100bins": 0.9411353191318031, "calibration/batch_entropy_10bins": 0.9383568457569865, "calibration/batch_entropy_50bins": 0.9473751559506512, "calibration/batch_uniqueness": 0.9451766886989332, "calibration/buffer_distribution_entropy": 0.7766920853518905, "calibration/buffer_entropy_100bins": 0.7748170257856153, "calibration/buffer_entropy_10bins": 0.7766920853518905, "calibration/buffer_entropy_50bins": 0.8095014447600581, "calibration/confidence_entropy": 0.5105895803297336, "calibration/coverage@0%": 0.011043239043669717, "calibration/coverage@1%": 0.011043239043669717, "calibration/coverage@10%": 0.03677228627460259, "calibration/coverage@15%": 0.08397971397598639, "calibration/coverage@20%": 0.33836452037838355, "calibration/coverage@25%": 0.6420780927200319, "calibration/coverage@30%": 0.8848675040846755, "calibration/coverage@5%": 0.011043239043669717, "calibration/distribution_entropy_10": 0.9383568457569865, "calibration/distribution_entropy_100": 0.9411353191318031, "calibration/ece": 0.15390421541967228, "calibration/mean_confidence": 0.6241893393048994, "calibration/unique_confidence_per_question": 0.9890625, "calibration/unique_confidences": 379.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01883680555555558, "completions/max_length": 3759.2, "completions/max_terminated_length": 3759.2, "completions/mean_length": 753.6845581054688, "completions/mean_terminated_length": 768.2565063476562, "completions/min_length": 0.0, "completions/min_terminated_length": 226.2, "epoch": 0.09599880001499982, "grad_norm": 0.013629751279950142, "learning_rate": 4.761904761904762e-06, "loss": -0.0464, "num_tokens": 75550940.0, "reward": 0.97404465675354, "reward_std": 0.15893589854240417, "rewards/accuracy_reward": 0.6609375, "rewards/batch_coverage_0": 0.19304660856723785, "rewards/brier_reward": 0.7446058511734008, "rewards/confidence_uniqueness_reward": 0.9269584774971008, "rewards/format_reward": 0.9806423664093018, "rewards/frontier_entropy_batch_reward": -0.3320638656616211, "signal/accuracy_reward/centered_abs_mean": 0.17389322817325592, "signal/accuracy_reward/group_std_mean": 0.23077381253242493, "signal/accuracy_reward/group_zero_std_frac": 0.347222226858139, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9138347387313843, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08694661408662796, "signal/advantage_abs_mean": 0.7391909837722779, "signal/advantage_pre_scale_abs_mean": 0.11861821860074998, "signal/advantage_pre_scale_std": 0.18562033176422119, "signal/advantage_std": 0.9834268927574158, "signal/batch_coverage_0/centered_abs_mean": 0.1839183449745178, "signal/batch_coverage_0/group_std_mean": 0.24077706336975097, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.1939650923013687, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.018391834944486617, "signal/brier_reward/centered_abs_mean": 0.19649418592453002, "signal/brier_reward/group_std_mean": 0.24308202862739564, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20717605352401733, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.019649419561028482, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.045768487453460696, "signal/confidence_uniqueness_reward/group_std_mean": 0.07060145288705826, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04819247126579285, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004576848912984132, "signal/format_reward/centered_abs_mean": 0.03188476637005806, "signal/format_reward/group_std_mean": 0.05429861918091774, "signal/format_reward/group_zero_std_frac": 0.7972222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.16629874408245088, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01594238318502903, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.38026103377342224, "signal/frontier_entropy_batch_reward/group_std_mean": 0.44629310369491576, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4036199152469635, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.038026104867458346, "step": 40 }, { "calibration/aurc": 0.17401281085281242, "calibration/batch_distribution_entropy": 0.9257879515650167, "calibration/batch_entropy_100bins": 0.9322180704729982, "calibration/batch_entropy_10bins": 0.9257879515650167, "calibration/batch_entropy_50bins": 0.9403918369687571, "calibration/batch_uniqueness": 0.9413443750086088, "calibration/buffer_distribution_entropy": 0.8118769211393018, "calibration/buffer_entropy_100bins": 0.8163705193225421, "calibration/buffer_entropy_10bins": 0.8118769211393018, "calibration/buffer_entropy_50bins": 0.8437264714491857, "calibration/confidence_entropy": 0.4751647145810791, "calibration/coverage@0%": 0.018616960887388335, "calibration/coverage@1%": 0.018616960887388335, "calibration/coverage@10%": 0.11996558000338249, "calibration/coverage@15%": 0.31641030253688285, "calibration/coverage@20%": 0.8198981422435141, "calibration/coverage@25%": 0.9679174333611573, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.04231194351809785, "calibration/distribution_entropy_10": 0.9257879515650167, "calibration/distribution_entropy_100": 0.9322180704729982, "calibration/ece": 0.15314606802324127, "calibration/mean_confidence": 0.6360751615599385, "calibration/unique_confidence_per_question": 0.9807291666666667, "calibration/unique_confidences": 376.6, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017361111111111095, "completions/max_length": 3637.2, "completions/max_terminated_length": 3637.2, "completions/mean_length": 718.997998046875, "completions/mean_terminated_length": 731.70673828125, "completions/min_length": 0.0, "completions/min_terminated_length": 221.0, "epoch": 0.1079986500168748, "grad_norm": 0.003335554851219058, "learning_rate": 4.909638554216868e-06, "loss": -0.0454, "num_tokens": 86969061.0, "reward": 0.9880038857460022, "reward_std": 0.15024617910385132, "rewards/accuracy_reward": 0.6736979126930237, "rewards/batch_coverage_0": 0.2329769492149353, "rewards/brier_reward": 0.7637104630470276, "rewards/confidence_uniqueness_reward": 0.9286281347274781, "rewards/format_reward": 0.9825520753860474, "rewards/frontier_entropy_batch_reward": -0.3265267163515091, "signal/accuracy_reward/centered_abs_mean": 0.17314996123313903, "signal/accuracy_reward/group_std_mean": 0.2271723985671997, "signal/accuracy_reward/group_zero_std_frac": 0.3583333373069763, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9806798100471497, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08657498061656951, "signal/advantage_abs_mean": 0.7340533494949341, "signal/advantage_pre_scale_abs_mean": 0.11192792057991027, "signal/advantage_pre_scale_std": 0.17584609091281891, "signal/advantage_std": 0.9833369493484497, "signal/batch_coverage_0/centered_abs_mean": 0.1993844449520111, "signal/batch_coverage_0/group_std_mean": 0.25716363787651064, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2259293019771576, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.0199384443461895, "signal/brier_reward/centered_abs_mean": 0.18940032422542571, "signal/brier_reward/group_std_mean": 0.23733858466148378, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.2157978594303131, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.01894003227353096, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.041976536810398105, "signal/confidence_uniqueness_reward/group_std_mean": 0.06569402515888215, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.048248696699738504, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004197653895244002, "signal/format_reward/centered_abs_mean": 0.02760959230363369, "signal/format_reward/group_std_mean": 0.048797968029975894, "signal/format_reward/group_zero_std_frac": 0.8083333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.15639951974153518, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.013804796151816845, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.36036349534988404, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4273237884044647, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41718631982803345, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.036036350578069684, "step": 45 }, { "calibration/aurc": 0.3161069454398612, "calibration/batch_distribution_entropy": 0.9644760827545266, "calibration/batch_entropy_100bins": 0.9511358010751498, "calibration/batch_entropy_10bins": 0.9644760827545266, "calibration/batch_entropy_50bins": 0.9631218868935271, "calibration/batch_uniqueness": 0.9489304890631937, "calibration/buffer_distribution_entropy": 0.8392170070252319, "calibration/buffer_entropy_100bins": 0.8461842860655169, "calibration/buffer_entropy_10bins": 0.8392170070252319, "calibration/buffer_entropy_50bins": 0.8688301669951886, "calibration/confidence_entropy": 0.4740886147426032, "calibration/coverage@0%": 0.008463280011736029, "calibration/coverage@1%": 0.008463280011736029, "calibration/coverage@10%": 0.0237447516429417, "calibration/coverage@15%": 0.09319288994081405, "calibration/coverage@20%": 0.2054546457599283, "calibration/coverage@25%": 0.3819263744551913, "calibration/coverage@30%": 0.534008588531292, "calibration/coverage@5%": 0.008463280011736029, "calibration/distribution_entropy_10": 0.9644760827545266, "calibration/distribution_entropy_100": 0.9511358010751498, "calibration/ece": 0.15545994179292094, "calibration/mean_confidence": 0.5602707976117317, "calibration/unique_confidence_per_question": 0.9921875, "calibration/unique_confidences": 381.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01206597222222221, "completions/max_length": 3788.8, "completions/max_terminated_length": 3788.8, "completions/mean_length": 727.9027099609375, "completions/mean_terminated_length": 736.8207885742188, "completions/min_length": 0.0, "completions/min_terminated_length": 222.8, "epoch": 0.11999850001874976, "grad_norm": 0.003200685838237405, "learning_rate": 4.759036144578314e-06, "loss": -0.0375, "num_tokens": 98452100.0, "reward": 0.9884833931922913, "reward_std": 0.1402774930000305, "rewards/accuracy_reward": 0.6598958373069763, "rewards/batch_coverage_0": 0.2569040894508362, "rewards/brier_reward": 0.7681125044822693, "rewards/confidence_uniqueness_reward": 0.9333637595176697, "rewards/format_reward": 0.9877604246139526, "rewards/frontier_entropy_batch_reward": -0.3118279218673706, "signal/accuracy_reward/centered_abs_mean": 0.15991753339767456, "signal/accuracy_reward/group_std_mean": 0.21020196974277497, "signal/accuracy_reward/group_zero_std_frac": 0.40277777910232543, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9491617441177368, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07995876669883728, "signal/advantage_abs_mean": 0.7388450384140015, "signal/advantage_pre_scale_abs_mean": 0.1035029023885727, "signal/advantage_pre_scale_std": 0.16441345810890198, "signal/advantage_std": 0.983297336101532, "signal/batch_coverage_0/centered_abs_mean": 0.19529581367969512, "signal/batch_coverage_0/group_std_mean": 0.24793221652507783, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23113827407360077, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.01952958106994629, "signal/brier_reward/centered_abs_mean": 0.17141424715518952, "signal/brier_reward/group_std_mean": 0.21658135950565338, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.2038181960582733, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.01714142579585314, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.036786551028490065, "signal/confidence_uniqueness_reward/group_std_mean": 0.058603516221046446, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04392699599266052, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0036786553915590047, "signal/format_reward/centered_abs_mean": 0.021164279617369174, "signal/format_reward/group_std_mean": 0.04010897055268288, "signal/format_reward/group_zero_std_frac": 0.8361111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.12460318654775619, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010582139808684587, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3541642606258392, "signal/frontier_entropy_batch_reward/group_std_mean": 0.42405728101730344, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.42257370352745055, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03541642650961876, "step": 50 }, { "epoch": 0.11999850001874976, "eval_completions/clipped_ratio": 0.010416666666666649, "eval_completions/max_length": 2050.5, "eval_completions/max_terminated_length": 2050.5, "eval_completions/mean_length": 720.9482727050781, "eval_completions/mean_terminated_length": 728.4109090169271, "eval_completions/min_length": 96.16666666666667, "eval_completions/min_terminated_length": 252.5, "eval_loss": 0.0, "eval_num_tokens": 98452100.0, "eval_reward": 0.9005749424298605, "eval_reward_std": 0.2392077917853991, "eval_rewards/accuracy_reward": 0.6744791567325592, "eval_rewards/batch_coverage_0": 0.008232661212484041, "eval_rewards/brier_reward": 0.7813109854857127, "eval_rewards/confidence_uniqueness_reward": 0.885476420323054, "eval_rewards/format_reward": 0.9895833233992258, "eval_rewards/frontier_entropy_batch_reward": -0.9895833233992258, "eval_runtime": 179.5931, "eval_samples_per_second": 5.568, "eval_signal/accuracy_reward/centered_abs_mean": 0.4273545990387599, "eval_signal/accuracy_reward/group_std_mean": 0.4683712025483449, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9062869151433309, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.21367729951937994, "eval_signal/advantage_abs_mean": 0.8733128209908804, "eval_signal/advantage_pre_scale_abs_mean": 0.2092428207397461, "eval_signal/advantage_pre_scale_std": 0.23747060696283975, "eval_signal/advantage_std": 0.9864048759142557, "eval_signal/batch_coverage_0/centered_abs_mean": 0.17604178686936697, "eval_signal/batch_coverage_0/group_std_mean": 0.26295527070760727, "eval_signal/batch_coverage_0/group_zero_std_frac": 0.0, "eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.0748915175596873, "eval_signal/batch_coverage_0/weight": 0.10000000149011612, "eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.017604179369906586, "eval_signal/brier_reward/centered_abs_mean": 0.19752245396375656, "eval_signal/brier_reward/group_std_mean": 0.2500632430116336, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08338230103254318, "eval_signal/brier_reward/weight": 0.10000000149011612, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.019752246638139088, "eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.05403412195543448, "eval_signal/confidence_uniqueness_reward/group_std_mean": 0.08773908950388432, "eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.022890682332217693, "eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005403412350763877, "eval_signal/format_reward/centered_abs_mean": 0.019965277363856632, "eval_signal/format_reward/group_std_mean": 0.05294674759109815, "eval_signal/format_reward/group_zero_std_frac": 0.7222222288449606, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.04037875309586525, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.009982638681928316, "eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.019965277363856632, "eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.05294674759109815, "eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.7222222288449606, "eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.00807575105379025, "eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0019965277363856635, "eval_steps_per_second": 0.033, "step": 50 }, { "epoch": 0.11999850001874976, "step": 50, "train_probe_completions/clipped_ratio": 0.015277777777777779, "train_probe_completions/max_length": 2122.8333333333335, "train_probe_completions/max_terminated_length": 2122.8333333333335, "train_probe_completions/mean_length": 716.4770914713541, "train_probe_completions/mean_terminated_length": 727.799326578776, "train_probe_completions/min_length": 0.0, "train_probe_completions/min_terminated_length": 242.83333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 98452100.0, "train_probe_reward": 0.9082557559013367, "train_probe_reward_std": 0.2357780635356903, "train_probe_rewards/accuracy_reward": 0.6814236044883728, "train_probe_rewards/batch_coverage_0": 0.025458375923335552, "train_probe_rewards/brier_reward": 0.8000073134899139, "train_probe_rewards/confidence_uniqueness_reward": 0.8881678680578867, "train_probe_rewards/format_reward": 0.9904513855775198, "train_probe_rewards/frontier_entropy_batch_reward": -0.9904513855775198, "train_probe_runtime": 206.2151, "train_probe_samples_per_second": 4.849, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4191080729166667, "train_probe_signal/accuracy_reward/group_std_mean": 0.4634708563486735, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9035188257694244, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.20955403645833334, "train_probe_signal/advantage_abs_mean": 0.8659235934416453, "train_probe_signal/advantage_pre_scale_abs_mean": 0.20504274467627207, "train_probe_signal/advantage_pre_scale_std": 0.23454017440478006, "train_probe_signal/advantage_std": 0.9863985180854797, "train_probe_signal/batch_coverage_0/centered_abs_mean": 0.16327512512604395, "train_probe_signal/batch_coverage_0/group_std_mean": 0.23903479675451914, "train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0, "train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07043657451868057, "train_probe_signal/batch_coverage_0/weight": 0.10000000149011612, "train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.016327512450516224, "train_probe_signal/brier_reward/centered_abs_mean": 0.17697453250487646, "train_probe_signal/brier_reward/group_std_mean": 0.23091630637645721, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07631279900670052, "train_probe_signal/brier_reward/weight": 0.10000000149011612, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.017697453498840332, "train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.049318368236223854, "train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.08279193627337615, "train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02125573654969533, "train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004931837009886901, "train_probe_signal/format_reward/centered_abs_mean": 0.018500433613856632, "train_probe_signal/format_reward/group_std_mean": 0.054015101244052253, "train_probe_signal/format_reward/group_zero_std_frac": 0.6944444676240286, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.03978658188134432, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.009250216806928316, "train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.018500433613856632, "train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.054015101244052253, "train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.6944444676240286, "train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.007957316547011336, "train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0018500435204866033, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.22472360402010977, "calibration/batch_distribution_entropy": 0.9609207169219642, "calibration/batch_entropy_100bins": 0.9516821872459627, "calibration/batch_entropy_10bins": 0.9609207169219642, "calibration/batch_entropy_50bins": 0.9619270817134706, "calibration/batch_uniqueness": 0.9474521596011586, "calibration/buffer_distribution_entropy": 0.8653756366467846, "calibration/buffer_entropy_100bins": 0.8726938653024712, "calibration/buffer_entropy_10bins": 0.8653756366467846, "calibration/buffer_entropy_50bins": 0.8914594965398323, "calibration/confidence_entropy": 0.4753242399571679, "calibration/coverage@0%": 0.023157928701659374, "calibration/coverage@1%": 0.023157928701659374, "calibration/coverage@10%": 0.20255356707408798, "calibration/coverage@15%": 0.29319682499541944, "calibration/coverage@20%": 0.3988951974691966, "calibration/coverage@25%": 0.6721432631962692, "calibration/coverage@30%": 0.8863604042477233, "calibration/coverage@5%": 0.13758263084949302, "calibration/distribution_entropy_10": 0.9609207169219642, "calibration/distribution_entropy_100": 0.9516821872459627, "calibration/ece": 0.16699372970477053, "calibration/mean_confidence": 0.5923917872653834, "calibration/unique_confidence_per_question": 0.9854166666666668, "calibration/unique_confidences": 378.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010329861111111093, "completions/max_length": 3465.4, "completions/max_terminated_length": 3465.4, "completions/mean_length": 736.5135498046875, "completions/mean_terminated_length": 744.2725219726562, "completions/min_length": 0.0, "completions/min_terminated_length": 190.2, "epoch": 0.13199835002062474, "grad_norm": 0.004147018771618605, "learning_rate": 4.60843373493976e-06, "loss": -0.0263, "num_tokens": 110017312.0, "reward": 0.9964199542999268, "reward_std": 0.13358933180570604, "rewards/accuracy_reward": 0.6614583253860473, "rewards/batch_coverage_0": 0.2666813760995865, "rewards/brier_reward": 0.7828863620758056, "rewards/confidence_uniqueness_reward": 0.9379994750022889, "rewards/format_reward": 0.9893229007720947, "rewards/frontier_entropy_batch_reward": -0.2772741973400116, "signal/accuracy_reward/centered_abs_mean": 0.1580186665058136, "signal/accuracy_reward/group_std_mean": 0.20960874259471893, "signal/accuracy_reward/group_zero_std_frac": 0.38888888955116274, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.97646164894104, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0790093332529068, "signal/advantage_abs_mean": 0.7356102347373963, "signal/advantage_pre_scale_abs_mean": 0.09942354112863541, "signal/advantage_pre_scale_std": 0.1567298650741577, "signal/advantage_std": 0.9832465767860412, "signal/batch_coverage_0/centered_abs_mean": 0.18050281703472137, "signal/batch_coverage_0/group_std_mean": 0.2307935357093811, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22286758720874786, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.01805028095841408, "signal/brier_reward/centered_abs_mean": 0.1542533814907074, "signal/brier_reward/group_std_mean": 0.198762246966362, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19102311730384827, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.01542533803731203, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.030431923642754554, "signal/confidence_uniqueness_reward/group_std_mean": 0.04879971742630005, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03733023479580879, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003043192345649004, "signal/format_reward/centered_abs_mean": 0.016975911520421504, "signal/format_reward/group_std_mean": 0.03276526555418968, "signal/format_reward/group_zero_std_frac": 0.8611111164093017, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.10242971032857895, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.008487955760210752, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3267883062362671, "signal/frontier_entropy_batch_reward/group_std_mean": 0.3983007729053497, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4050808668136597, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03267883062362671, "step": 55 }, { "calibration/aurc": 0.2827445677122261, "calibration/batch_distribution_entropy": 0.9694165902358778, "calibration/batch_entropy_100bins": 0.9543224931993513, "calibration/batch_entropy_10bins": 0.9694165902358778, "calibration/batch_entropy_50bins": 0.9665939385794944, "calibration/batch_uniqueness": 0.9510640820618093, "calibration/buffer_distribution_entropy": 0.880743493705171, "calibration/buffer_entropy_100bins": 0.8888775798420593, "calibration/buffer_entropy_10bins": 0.880743493705171, "calibration/buffer_entropy_50bins": 0.904910653236246, "calibration/confidence_entropy": 0.4844323737164452, "calibration/coverage@0%": 0.0135758135283677, "calibration/coverage@1%": 0.0135758135283677, "calibration/coverage@10%": 0.08098455606724095, "calibration/coverage@15%": 0.3142387701904429, "calibration/coverage@20%": 0.4361154606476519, "calibration/coverage@25%": 0.5866153823361311, "calibration/coverage@30%": 0.679600433190058, "calibration/coverage@5%": 0.0135758135283677, "calibration/distribution_entropy_10": 0.9694165902358778, "calibration/distribution_entropy_100": 0.9543224931993513, "calibration/ece": 0.14178054807969714, "calibration/mean_confidence": 0.5263035042002124, "calibration/unique_confidence_per_question": 0.9895833333333334, "calibration/unique_confidences": 380.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012500000000000022, "completions/max_length": 3425.8, "completions/max_terminated_length": 3425.8, "completions/mean_length": 742.4217041015625, "completions/mean_terminated_length": 751.9814697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 229.2, "epoch": 0.14399820002249972, "grad_norm": 0.00289203436113894, "learning_rate": 4.457831325301205e-06, "loss": -0.0292, "num_tokens": 121666586.0, "reward": 0.9800794243812561, "reward_std": 0.14277701675891877, "rewards/accuracy_reward": 0.6308159708976746, "rewards/batch_coverage_0": 0.2683397114276886, "rewards/brier_reward": 0.7775330781936646, "rewards/confidence_uniqueness_reward": 0.9373286366462708, "rewards/format_reward": 0.9873263835906982, "rewards/frontier_entropy_batch_reward": -0.27311920523643496, "signal/accuracy_reward/centered_abs_mean": 0.174853515625, "signal/accuracy_reward/group_std_mean": 0.226035276055336, "signal/accuracy_reward/group_zero_std_frac": 0.37222222685813905, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0188154578208923, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0874267578125, "signal/advantage_abs_mean": 0.7372269749641418, "signal/advantage_pre_scale_abs_mean": 0.10599318891763687, "signal/advantage_pre_scale_std": 0.1663038432598114, "signal/advantage_std": 0.983307147026062, "signal/batch_coverage_0/centered_abs_mean": 0.18373815715312958, "signal/batch_coverage_0/group_std_mean": 0.23306281864643097, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21616858541965484, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.018373816460371017, "signal/brier_reward/centered_abs_mean": 0.16176026165485383, "signal/brier_reward/group_std_mean": 0.20582883358001708, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18924815058708191, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.016176026687026022, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.03418286256492138, "signal/confidence_uniqueness_reward/group_std_mean": 0.05919753760099411, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03993520997464657, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003418286330997944, "signal/format_reward/centered_abs_mean": 0.022428384982049465, "signal/format_reward/group_std_mean": 0.04563566856086254, "signal/format_reward/group_zero_std_frac": 0.8, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.12949552983045579, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.011214192491024733, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3305469274520874, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4020517349243164, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.38890537023544314, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033054694533348083, "step": 60 }, { "calibration/aurc": 0.17680634517317753, "calibration/batch_distribution_entropy": 0.9223857930936965, "calibration/batch_entropy_100bins": 0.9294423511504757, "calibration/batch_entropy_10bins": 0.9223857930936965, "calibration/batch_entropy_50bins": 0.9373286928503596, "calibration/batch_uniqueness": 0.9415649447642684, "calibration/buffer_distribution_entropy": 0.8967923017175998, "calibration/buffer_entropy_100bins": 0.903105019101886, "calibration/buffer_entropy_10bins": 0.8967923017175998, "calibration/buffer_entropy_50bins": 0.9176593724373063, "calibration/confidence_entropy": 0.4511241150397975, "calibration/coverage@0%": 0.033093853390412764, "calibration/coverage@1%": 0.033093853390412764, "calibration/coverage@10%": 0.5010672240079564, "calibration/coverage@15%": 0.5694402006428644, "calibration/coverage@20%": 0.6407083346546165, "calibration/coverage@25%": 0.7087965713901342, "calibration/coverage@30%": 0.7420715799184168, "calibration/coverage@5%": 0.3894755245494424, "calibration/distribution_entropy_10": 0.9223857930936965, "calibration/distribution_entropy_100": 0.9294423511504757, "calibration/ece": 0.13352572398916868, "calibration/mean_confidence": 0.6091362952833019, "calibration/unique_confidence_per_question": 0.9739583333333334, "calibration/unique_confidences": 374.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02065972222222221, "completions/max_length": 2984.6, "completions/max_terminated_length": 2984.6, "completions/mean_length": 691.2902954101562, "completions/mean_terminated_length": 705.7674682617187, "completions/min_length": 0.0, "completions/min_terminated_length": 152.8, "epoch": 0.1559980500243747, "grad_norm": 0.02314193919301033, "learning_rate": 4.307228915662651e-06, "loss": -0.0495, "num_tokens": 132724298.0, "reward": 0.9898407101631165, "reward_std": 0.1416984274983406, "rewards/accuracy_reward": 0.65703125, "rewards/batch_coverage_0": 0.31068081855773927, "rewards/brier_reward": 0.8045204639434814, "rewards/confidence_uniqueness_reward": 0.9254239439964295, "rewards/format_reward": 0.9787326455116272, "rewards/frontier_entropy_batch_reward": -0.32103757858276366, "signal/accuracy_reward/centered_abs_mean": 0.14537217915058137, "signal/accuracy_reward/group_std_mean": 0.19845550060272216, "signal/accuracy_reward/group_zero_std_frac": 0.40833333134651184, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9093841433525085, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07268608957529069, "signal/advantage_abs_mean": 0.7226288199424744, "signal/advantage_pre_scale_abs_mean": 0.10107557326555253, "signal/advantage_pre_scale_std": 0.171230810880661, "signal/advantage_std": 0.983215045928955, "signal/batch_coverage_0/centered_abs_mean": 0.17129134237766266, "signal/batch_coverage_0/group_std_mean": 0.21862670183181762, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2162470906972885, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.017129134200513364, "signal/brier_reward/centered_abs_mean": 0.14572922885417938, "signal/brier_reward/group_std_mean": 0.1897602289915085, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1829657733440399, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.014572923444211483, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.04564875364303589, "signal/confidence_uniqueness_reward/group_std_mean": 0.07703709006309509, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.057298554480075835, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004564875271171331, "signal/format_reward/centered_abs_mean": 0.03393554724752903, "signal/format_reward/group_std_mean": 0.06373440623283386, "signal/format_reward/group_zero_std_frac": 0.7361111164093017, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.21068618595600128, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.016967773623764514, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34571743607521055, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4146285653114319, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43939276933670046, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03457174450159073, "step": 65 }, { "calibration/aurc": 0.255712961146632, "calibration/batch_distribution_entropy": 0.9388658306441016, "calibration/batch_entropy_100bins": 0.9374460761801705, "calibration/batch_entropy_10bins": 0.9388658306441016, "calibration/batch_entropy_50bins": 0.9461765391704772, "calibration/batch_uniqueness": 0.943961165940431, "calibration/buffer_distribution_entropy": 0.9027612944060565, "calibration/buffer_entropy_100bins": 0.9118606001305496, "calibration/buffer_entropy_10bins": 0.9027612944060565, "calibration/buffer_entropy_50bins": 0.9241160182250978, "calibration/confidence_entropy": 0.441581652647624, "calibration/coverage@0%": 0.09105444880082295, "calibration/coverage@1%": 0.09606837637742462, "calibration/coverage@10%": 0.20810418588988497, "calibration/coverage@15%": 0.25194888797714865, "calibration/coverage@20%": 0.2779983592606131, "calibration/coverage@25%": 0.31211272924258926, "calibration/coverage@30%": 0.6796830076800402, "calibration/coverage@5%": 0.13181085772043027, "calibration/distribution_entropy_10": 0.9388658306441016, "calibration/distribution_entropy_100": 0.9374460761801705, "calibration/ece": 0.18058968785492258, "calibration/mean_confidence": 0.5582756787869785, "calibration/unique_confidence_per_question": 0.9671875, "calibration/unique_confidences": 371.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02960069444444442, "completions/max_length": 3201.8, "completions/max_terminated_length": 3201.8, "completions/mean_length": 635.1415893554688, "completions/mean_terminated_length": 654.58779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 174.4, "epoch": 0.16799790002624967, "grad_norm": 0.00906718336045742, "learning_rate": 4.156626506024097e-06, "loss": -0.0809, "num_tokens": 143119273.0, "reward": 0.9665384531021118, "reward_std": 0.159322065114975, "rewards/accuracy_reward": 0.6370659828186035, "rewards/batch_coverage_0": 0.2889437019824982, "rewards/brier_reward": 0.7667541742324829, "rewards/confidence_uniqueness_reward": 0.9159990549087524, "rewards/format_reward": 0.9701388835906982, "rewards/frontier_entropy_batch_reward": -0.3423368394374847, "signal/accuracy_reward/centered_abs_mean": 0.16377495527267455, "signal/accuracy_reward/group_std_mean": 0.212760990858078, "signal/accuracy_reward/group_zero_std_frac": 0.4083333373069763, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9339677453041076, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08188747763633727, "signal/advantage_abs_mean": 0.7247249484062195, "signal/advantage_pre_scale_abs_mean": 0.114870385825634, "signal/advantage_pre_scale_std": 0.1892496645450592, "signal/advantage_std": 0.9833415269851684, "signal/batch_coverage_0/centered_abs_mean": 0.19291015863418579, "signal/batch_coverage_0/group_std_mean": 0.2440529942512512, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21951412558555602, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.019291016831994058, "signal/brier_reward/centered_abs_mean": 0.17383241057395935, "signal/brier_reward/group_std_mean": 0.2209314674139023, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19668594300746917, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.017383241280913352, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.05868452787399292, "signal/confidence_uniqueness_reward/group_std_mean": 0.09797782599925994, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.0654455192387104, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005868452694267034, "signal/format_reward/centered_abs_mean": 0.04973958320915699, "signal/format_reward/group_std_mean": 0.08830733895301819, "signal/format_reward/group_zero_std_frac": 0.6583333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.2726438879966736, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.024869791604578494, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.36462308168411256, "signal/frontier_entropy_batch_reward/group_std_mean": 0.43168188333511354, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4166643261909485, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03646230846643448, "step": 70 }, { "calibration/aurc": 0.20402879477321081, "calibration/batch_distribution_entropy": 0.9100570338201137, "calibration/batch_entropy_100bins": 0.9255242664998121, "calibration/batch_entropy_10bins": 0.9100570338201137, "calibration/batch_entropy_50bins": 0.9331518001514901, "calibration/batch_uniqueness": 0.9388446130619105, "calibration/buffer_distribution_entropy": 0.9075425189572274, "calibration/buffer_entropy_100bins": 0.9188536488152922, "calibration/buffer_entropy_10bins": 0.9075425189572274, "calibration/buffer_entropy_50bins": 0.9292881590319055, "calibration/confidence_entropy": 0.4662604234655504, "calibration/coverage@0%": 0.01183253152297172, "calibration/coverage@1%": 0.01183253152297172, "calibration/coverage@10%": 0.3507062010567685, "calibration/coverage@15%": 0.4746198083784761, "calibration/coverage@20%": 0.6568163427547684, "calibration/coverage@25%": 0.7129386973791396, "calibration/coverage@30%": 0.7461053848765358, "calibration/coverage@5%": 0.06645065896718613, "calibration/distribution_entropy_10": 0.9100570338201137, "calibration/distribution_entropy_100": 0.9255242664998121, "calibration/ece": 0.1262777985805592, "calibration/mean_confidence": 0.6514417334269297, "calibration/unique_confidence_per_question": 0.975, "calibration/unique_confidences": 374.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.023263888888888883, "completions/max_length": 2466.4, "completions/max_terminated_length": 2466.4, "completions/mean_length": 585.1560791015625, "completions/mean_terminated_length": 599.1790893554687, "completions/min_length": 0.0, "completions/min_terminated_length": 172.2, "epoch": 0.17999775002812465, "grad_norm": 0.004078112542629242, "learning_rate": 4.006024096385543e-06, "loss": -0.0624, "num_tokens": 152925167.0, "reward": 0.9990050196647644, "reward_std": 0.15093266367912292, "rewards/accuracy_reward": 0.6853298544883728, "rewards/batch_coverage_0": 0.3030748426914215, "rewards/brier_reward": 0.8090933322906494, "rewards/confidence_uniqueness_reward": 0.9204216599464417, "rewards/format_reward": 0.976475715637207, "rewards/frontier_entropy_batch_reward": -0.3515673041343689, "signal/accuracy_reward/centered_abs_mean": 0.1615180104970932, "signal/accuracy_reward/group_std_mean": 0.21247020363807678, "signal/accuracy_reward/group_zero_std_frac": 0.4, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0372991800308227, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0807590052485466, "signal/advantage_abs_mean": 0.7307239651679993, "signal/advantage_pre_scale_abs_mean": 0.10921901911497116, "signal/advantage_pre_scale_std": 0.18343055248260498, "signal/advantage_std": 0.9831865429878235, "signal/batch_coverage_0/centered_abs_mean": 0.15994410514831542, "signal/batch_coverage_0/group_std_mean": 0.20556128919124603, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20666089951992034, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015994411334395407, "signal/brier_reward/centered_abs_mean": 0.14350559711456298, "signal/brier_reward/group_std_mean": 0.18532647788524628, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18572898209095, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.014350558631122112, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.052418901771306994, "signal/confidence_uniqueness_reward/group_std_mean": 0.0843198612332344, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06781294569373131, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005241890251636505, "signal/format_reward/centered_abs_mean": 0.04034830741584301, "signal/format_reward/group_std_mean": 0.07034000232815743, "signal/format_reward/group_zero_std_frac": 0.7361111283302307, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.2581541657447815, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.020174153707921506, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3405683159828186, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40916064381599426, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.44408488273620605, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0340568333864212, "step": 75 }, { "calibration/aurc": 0.21312748209126955, "calibration/batch_distribution_entropy": 0.9192534724128174, "calibration/batch_entropy_100bins": 0.9296142793543695, "calibration/batch_entropy_10bins": 0.9192534724128174, "calibration/batch_entropy_50bins": 0.9355600534978465, "calibration/batch_uniqueness": 0.9408131449770669, "calibration/buffer_distribution_entropy": 0.9106141449919584, "calibration/buffer_entropy_100bins": 0.9244551232284438, "calibration/buffer_entropy_10bins": 0.9106141449919584, "calibration/buffer_entropy_50bins": 0.9331678191582089, "calibration/confidence_entropy": 0.4631196946712599, "calibration/coverage@0%": 0.005760600891838782, "calibration/coverage@1%": 0.005760600891838782, "calibration/coverage@10%": 0.27219365801624273, "calibration/coverage@15%": 0.3836405255871157, "calibration/coverage@20%": 0.5819669751686798, "calibration/coverage@25%": 0.7454837778785506, "calibration/coverage@30%": 0.8588018973538969, "calibration/coverage@5%": 0.0398236205012986, "calibration/distribution_entropy_10": 0.9192534724128174, "calibration/distribution_entropy_100": 0.9296142793543695, "calibration/ece": 0.14439641011437374, "calibration/mean_confidence": 0.6531200786070206, "calibration/unique_confidence_per_question": 0.9916666666666666, "calibration/unique_confidences": 380.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01571180555555556, "completions/max_length": 3380.0, "completions/max_terminated_length": 3380.0, "completions/mean_length": 588.9006958007812, "completions/mean_terminated_length": 598.3701049804688, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.19199760002999963, "grad_norm": 0.00266805081628263, "learning_rate": 3.855421686746989e-06, "loss": -0.0537, "num_tokens": 162762583.0, "reward": 0.9848804593086242, "reward_std": 0.14701928794384003, "rewards/accuracy_reward": 0.6446180582046509, "rewards/batch_coverage_0": 0.28264726996421813, "rewards/brier_reward": 0.7860515952110291, "rewards/confidence_uniqueness_reward": 0.9323421716690063, "rewards/format_reward": 0.9842882037162781, "rewards/frontier_entropy_batch_reward": -0.296767795085907, "signal/accuracy_reward/centered_abs_mean": 0.16694878339767455, "signal/accuracy_reward/group_std_mean": 0.21657846570014955, "signal/accuracy_reward/group_zero_std_frac": 0.397222226858139, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9957749605178833, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08347439169883727, "signal/advantage_abs_mean": 0.7289156675338745, "signal/advantage_pre_scale_abs_mean": 0.10731721073389053, "signal/advantage_pre_scale_std": 0.17323702573776245, "signal/advantage_std": 0.983283269405365, "signal/batch_coverage_0/centered_abs_mean": 0.16982728838920594, "signal/batch_coverage_0/group_std_mean": 0.21664536595344544, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20365143120288848, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.016982728615403175, "signal/brier_reward/centered_abs_mean": 0.1532373309135437, "signal/brier_reward/group_std_mean": 0.1974636048078537, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18305174112319947, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.015323733352124691, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.04000471830368042, "signal/confidence_uniqueness_reward/group_std_mean": 0.07147712409496307, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.047922450304031375, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004000471858307719, "signal/format_reward/centered_abs_mean": 0.02828233428299427, "signal/format_reward/group_std_mean": 0.058084848523139956, "signal/format_reward/group_zero_std_frac": 0.7472222208976745, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.16909490078687667, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.014141167141497135, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.34231672883033754, "signal/frontier_entropy_batch_reward/group_std_mean": 0.41321871280670164, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4113028526306152, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034231672435998915, "step": 80 }, { "calibration/aurc": 0.1632135076835743, "calibration/batch_distribution_entropy": 0.9659063993196713, "calibration/batch_entropy_100bins": 0.9536454574623983, "calibration/batch_entropy_10bins": 0.9659063993196713, "calibration/batch_entropy_50bins": 0.965706134473739, "calibration/batch_uniqueness": 0.9492283081586897, "calibration/buffer_distribution_entropy": 0.915495808049182, "calibration/buffer_entropy_100bins": 0.9300554231101016, "calibration/buffer_entropy_10bins": 0.915495808049182, "calibration/buffer_entropy_50bins": 0.9376210976212376, "calibration/confidence_entropy": 0.45173476267042323, "calibration/coverage@0%": 0.01754124210834912, "calibration/coverage@1%": 0.01754124210834912, "calibration/coverage@10%": 0.3911290055639153, "calibration/coverage@15%": 0.496067459083426, "calibration/coverage@20%": 0.6235121405255534, "calibration/coverage@25%": 0.7748134369680345, "calibration/coverage@30%": 0.8836314921488823, "calibration/coverage@5%": 0.25191154408155897, "calibration/distribution_entropy_10": 0.9659063993196713, "calibration/distribution_entropy_100": 0.9536454574623983, "calibration/ece": 0.15774952942281292, "calibration/mean_confidence": 0.526197596741875, "calibration/unique_confidence_per_question": 0.9817708333333334, "calibration/unique_confidences": 377.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.020746527777777767, "completions/max_length": 3311.0, "completions/max_terminated_length": 3311.0, "completions/mean_length": 603.830908203125, "completions/mean_terminated_length": 616.63349609375, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.2039974500318746, "grad_norm": 0.0040805041790008545, "learning_rate": 3.7048192771084342e-06, "loss": -0.0584, "num_tokens": 172805915.0, "reward": 1.0033825635910034, "reward_std": 0.14983320534229277, "rewards/accuracy_reward": 0.68515625, "rewards/batch_coverage_0": 0.296441388130188, "rewards/brier_reward": 0.7884108066558838, "rewards/confidence_uniqueness_reward": 0.9267748475074769, "rewards/format_reward": 0.978125, "rewards/frontier_entropy_batch_reward": -0.2942079544067383, "signal/accuracy_reward/centered_abs_mean": 0.160009765625, "signal/accuracy_reward/group_std_mean": 0.2096061587333679, "signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0137329697608948, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0800048828125, "signal/advantage_abs_mean": 0.722009789943695, "signal/advantage_pre_scale_abs_mean": 0.10756922066211701, "signal/advantage_pre_scale_std": 0.18065415024757386, "signal/advantage_std": 0.9832139730453491, "signal/batch_coverage_0/centered_abs_mean": 0.1854369521141052, "signal/batch_coverage_0/group_std_mean": 0.23206678926944732, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23619085252285005, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.01854369565844536, "signal/brier_reward/centered_abs_mean": 0.15676779747009278, "signal/brier_reward/group_std_mean": 0.20251947045326232, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19893690645694734, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.01567677892744541, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.048682890459895135, "signal/confidence_uniqueness_reward/group_std_mean": 0.08520418256521226, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06293119937181473, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004868289083242416, "signal/format_reward/centered_abs_mean": 0.03848741352558136, "signal/format_reward/group_std_mean": 0.07385490760207176, "signal/format_reward/group_zero_std_frac": 0.6972222208976746, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.2500002160668373, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01924370676279068, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3430877685546875, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4109717309474945, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4361910581588745, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03430877774953842, "step": 85 }, { "calibration/aurc": 0.12301126961784772, "calibration/batch_distribution_entropy": 0.9331017295414563, "calibration/batch_entropy_100bins": 0.9381418424454226, "calibration/batch_entropy_10bins": 0.9331017295414563, "calibration/batch_entropy_50bins": 0.9462402462271073, "calibration/batch_uniqueness": 0.9423444019843231, "calibration/buffer_distribution_entropy": 0.9202602316335554, "calibration/buffer_entropy_100bins": 0.9351472758276472, "calibration/buffer_entropy_10bins": 0.9202602316335554, "calibration/buffer_entropy_50bins": 0.9418012111533827, "calibration/confidence_entropy": 0.4612580112288266, "calibration/coverage@0%": 0.08628837167868916, "calibration/coverage@1%": 0.12313788358115851, "calibration/coverage@10%": 0.49191285867993495, "calibration/coverage@15%": 0.6691996369614962, "calibration/coverage@20%": 0.8408953364435259, "calibration/coverage@25%": 0.9078131047843024, "calibration/coverage@30%": 0.9499502310974621, "calibration/coverage@5%": 0.2717661114689772, "calibration/distribution_entropy_10": 0.9331017295414563, "calibration/distribution_entropy_100": 0.9381418424454226, "calibration/ece": 0.12834374377029065, "calibration/mean_confidence": 0.6120755217205869, "calibration/unique_confidence_per_question": 0.9484375, "calibration/unique_confidences": 364.2, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.050694444444444445, "completions/max_length": 2518.6, "completions/max_terminated_length": 2518.6, "completions/mean_length": 554.4845703125, "completions/mean_terminated_length": 584.150537109375, "completions/min_length": 0.0, "completions/min_terminated_length": 158.4, "epoch": 0.2159973000337496, "grad_norm": 0.005482567939907312, "learning_rate": 3.5542168674698798e-06, "loss": -0.1229, "num_tokens": 182262249.0, "reward": 0.9727075695991516, "reward_std": 0.1719568520784378, "rewards/accuracy_reward": 0.6586805582046509, "rewards/batch_coverage_0": 0.31638202667236326, "rewards/brier_reward": 0.7820794343948364, "rewards/confidence_uniqueness_reward": 0.8977606773376465, "rewards/format_reward": 0.9493055462837219, "rewards/frontier_entropy_batch_reward": -0.3090770959854126, "signal/accuracy_reward/centered_abs_mean": 0.14839409589767455, "signal/accuracy_reward/group_std_mean": 0.1954744428396225, "signal/accuracy_reward/group_zero_std_frac": 0.4444444477558136, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8960744976997376, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07419704794883727, "signal/advantage_abs_mean": 0.7284149289131164, "signal/advantage_pre_scale_abs_mean": 0.12656941562891005, "signal/advantage_pre_scale_std": 0.21349311172962188, "signal/advantage_std": 0.9832640409469604, "signal/batch_coverage_0/centered_abs_mean": 0.1728483885526657, "signal/batch_coverage_0/group_std_mean": 0.21621764600276946, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20976388156414033, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.01728483885526657, "signal/brier_reward/centered_abs_mean": 0.16384423673152923, "signal/brier_reward/group_std_mean": 0.2075037896633148, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19847835898399352, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.016384424082934855, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.08556002229452134, "signal/confidence_uniqueness_reward/group_std_mean": 0.12629829049110414, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.10300634056329727, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.008556002285331488, "signal/format_reward/centered_abs_mean": 0.0777560755610466, "signal/format_reward/group_std_mean": 0.11778671145439149, "signal/format_reward/group_zero_std_frac": 0.6083333492279053, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.465447860956192, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.0388780377805233, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33695358633995054, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40606662034988406, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4137517988681793, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03369535878300667, "step": 90 }, { "calibration/aurc": 0.17871668297594143, "calibration/batch_distribution_entropy": 0.9630264574407592, "calibration/batch_entropy_100bins": 0.953211917202391, "calibration/batch_entropy_10bins": 0.9630264574407592, "calibration/batch_entropy_50bins": 0.9643523891448694, "calibration/batch_uniqueness": 0.9492980405470421, "calibration/buffer_distribution_entropy": 0.9238771059311623, "calibration/buffer_entropy_100bins": 0.9393248125495454, "calibration/buffer_entropy_10bins": 0.9238771059311623, "calibration/buffer_entropy_50bins": 0.9451495151031304, "calibration/confidence_entropy": 0.4737993854396472, "calibration/coverage@0%": 0.07179699490704275, "calibration/coverage@1%": 0.07179699490704275, "calibration/coverage@10%": 0.37936506941291626, "calibration/coverage@15%": 0.5761405617386479, "calibration/coverage@20%": 0.63375926663008, "calibration/coverage@25%": 0.6948149018962416, "calibration/coverage@30%": 0.7423140495867768, "calibration/coverage@5%": 0.15318227629232414, "calibration/distribution_entropy_10": 0.9630264574407592, "calibration/distribution_entropy_100": 0.953211917202391, "calibration/ece": 0.14511776380834202, "calibration/mean_confidence": 0.5647102282125718, "calibration/unique_confidence_per_question": 0.9744791666666668, "calibration/unique_confidences": 374.2, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03229166666666668, "completions/max_length": 3165.2, "completions/max_terminated_length": 3165.2, "completions/mean_length": 608.8990478515625, "completions/mean_terminated_length": 629.1340454101562, "completions/min_length": 0.0, "completions/min_terminated_length": 162.4, "epoch": 0.22799715003562457, "grad_norm": 0.003710468765348196, "learning_rate": 3.4036144578313257e-06, "loss": -0.09, "num_tokens": 192368446.0, "reward": 0.9795109272003174, "reward_std": 0.15554179847240449, "rewards/accuracy_reward": 0.6442708373069763, "rewards/batch_coverage_0": 0.32638593316078185, "rewards/brier_reward": 0.7930331349372863, "rewards/confidence_uniqueness_reward": 0.9162774443626404, "rewards/format_reward": 0.9677083373069764, "rewards/frontier_entropy_batch_reward": -0.30048283040523527, "signal/accuracy_reward/centered_abs_mean": 0.14325086772441864, "signal/accuracy_reward/group_std_mean": 0.19051359593868256, "signal/accuracy_reward/group_zero_std_frac": 0.45555556416511533, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.91033376455307, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07162543386220932, "signal/advantage_abs_mean": 0.7236616730690002, "signal/advantage_pre_scale_abs_mean": 0.11148717403411865, "signal/advantage_pre_scale_std": 0.18938724100589752, "signal/advantage_std": 0.9832141995429993, "signal/batch_coverage_0/centered_abs_mean": 0.165391007065773, "signal/batch_coverage_0/group_std_mean": 0.21068246960639953, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2101570636034012, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.016539101675152778, "signal/brier_reward/centered_abs_mean": 0.15271125733852386, "signal/brier_reward/group_std_mean": 0.19472625851631165, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19408320188522338, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.015271125920116901, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.061642435193061826, "signal/confidence_uniqueness_reward/group_std_mean": 0.09935838431119919, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07879238203167915, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006164243910461664, "signal/format_reward/centered_abs_mean": 0.05253906324505806, "signal/format_reward/group_std_mean": 0.08931401669979096, "signal/format_reward/group_zero_std_frac": 0.669444453716278, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.335909104347229, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.02626953162252903, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32878002524375916, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40042484402656553, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4183509528636932, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.032878002524375914, "step": 95 }, { "calibration/aurc": 0.1454110969088863, "calibration/batch_distribution_entropy": 0.9613831954056984, "calibration/batch_entropy_100bins": 0.9493371779893444, "calibration/batch_entropy_10bins": 0.9613831954056984, "calibration/batch_entropy_50bins": 0.9593376772305942, "calibration/batch_uniqueness": 0.9485584531953885, "calibration/buffer_distribution_entropy": 0.9277519896654418, "calibration/buffer_entropy_100bins": 0.943189982754457, "calibration/buffer_entropy_10bins": 0.9277519896654418, "calibration/buffer_entropy_50bins": 0.9483683845977355, "calibration/confidence_entropy": 0.4596341598052634, "calibration/coverage@0%": 0.04663136393391499, "calibration/coverage@1%": 0.04663136393391499, "calibration/coverage@10%": 0.3281475131843537, "calibration/coverage@15%": 0.6423179933915689, "calibration/coverage@20%": 0.7606484419125537, "calibration/coverage@25%": 0.8922567125625301, "calibration/coverage@30%": 0.9526003355271648, "calibration/coverage@5%": 0.08622573245379841, "calibration/distribution_entropy_10": 0.9613831954056984, "calibration/distribution_entropy_100": 0.9493371779893444, "calibration/ece": 0.154073196829507, "calibration/mean_confidence": 0.5689832250180208, "calibration/unique_confidence_per_question": 0.9723958333333332, "calibration/unique_confidences": 373.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029340277777777767, "completions/max_length": 3467.8, "completions/max_terminated_length": 3467.8, "completions/mean_length": 663.86259765625, "completions/mean_terminated_length": 683.8996948242187, "completions/min_length": 0.0, "completions/min_terminated_length": 191.0, "epoch": 0.23999700003749952, "grad_norm": 0.019849933683872223, "learning_rate": 3.2530120481927713e-06, "loss": -0.0789, "num_tokens": 203115215.0, "reward": 0.9892123222351075, "reward_std": 0.1588016241788864, "rewards/accuracy_reward": 0.6633680582046508, "rewards/batch_coverage_0": 0.30833343267440794, "rewards/brier_reward": 0.7872236728668213, "rewards/confidence_uniqueness_reward": 0.919810950756073, "rewards/format_reward": 0.9704861164093017, "rewards/frontier_entropy_batch_reward": -0.2925153970718384, "signal/accuracy_reward/centered_abs_mean": 0.1627821147441864, "signal/accuracy_reward/group_std_mean": 0.2125428557395935, "signal/accuracy_reward/group_zero_std_frac": 0.4111111044883728, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0067266941070556, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0813910573720932, "signal/advantage_abs_mean": 0.7315644145011901, "signal/advantage_pre_scale_abs_mean": 0.1147262141108513, "signal/advantage_pre_scale_std": 0.189102640748024, "signal/advantage_std": 0.9832417488098144, "signal/batch_coverage_0/centered_abs_mean": 0.18047589659690857, "signal/batch_coverage_0/group_std_mean": 0.22958629131317138, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22420001327991484, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.01804758906364441, "signal/brier_reward/centered_abs_mean": 0.1647311270236969, "signal/brier_reward/group_std_mean": 0.20903607010841369, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20256026089191437, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.016473113372921944, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.05859675332903862, "signal/confidence_uniqueness_reward/group_std_mean": 0.09514907449483871, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07238487452268601, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005859675444662571, "signal/format_reward/centered_abs_mean": 0.04894748292863369, "signal/format_reward/group_std_mean": 0.08429807126522064, "signal/format_reward/group_zero_std_frac": 0.6833333373069763, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.29983839094638826, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.024473741464316846, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33697280287742615, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4080432951450348, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.420040899515152, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033697281032800674, "step": 100 }, { "epoch": 0.23999700003749952, "eval_completions/clipped_ratio": 0.021527777777777774, "eval_completions/max_length": 2685.8333333333335, "eval_completions/max_terminated_length": 2685.8333333333335, "eval_completions/mean_length": 686.6156514485677, "eval_completions/mean_terminated_length": 701.6330871582031, "eval_completions/min_length": 0.0, "eval_completions/min_terminated_length": 228.66666666666666, "eval_loss": 0.0, "eval_num_tokens": 203115215.0, "eval_reward": 0.8787461519241333, "eval_reward_std": 0.2547052949666977, "eval_rewards/accuracy_reward": 0.6397569477558136, "eval_rewards/batch_coverage_0": 0.02475695653508107, "eval_rewards/brier_reward": 0.7724942763646444, "eval_rewards/confidence_uniqueness_reward": 0.874758760134379, "eval_rewards/format_reward": 0.9791666666666666, "eval_rewards/frontier_entropy_batch_reward": -0.9791666666666666, "eval_runtime": 207.3585, "eval_samples_per_second": 4.823, "eval_signal/accuracy_reward/centered_abs_mean": 0.4423285573720932, "eval_signal/accuracy_reward/group_std_mean": 0.4765505294005076, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8821068207422892, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2211642786860466, "eval_signal/advantage_abs_mean": 0.8672481079896291, "eval_signal/advantage_pre_scale_abs_mean": 0.22098190089066824, "eval_signal/advantage_pre_scale_std": 0.25322698801755905, "eval_signal/advantage_std": 0.9864300489425659, "eval_signal/batch_coverage_0/centered_abs_mean": 0.19854053358236948, "eval_signal/batch_coverage_0/group_std_mean": 0.2833152214686076, "eval_signal/batch_coverage_0/group_zero_std_frac": 0.0, "eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07916944163541, "eval_signal/batch_coverage_0/weight": 0.10000000149011612, "eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.01985405369972189, "eval_signal/brier_reward/centered_abs_mean": 0.2045312076807022, "eval_signal/brier_reward/group_std_mean": 0.26509985824426013, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08176350096861522, "eval_signal/brier_reward/weight": 0.10000000149011612, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.020453120892246563, "eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.0659493872274955, "eval_signal/confidence_uniqueness_reward/group_std_mean": 0.12324381371339162, "eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.026245065964758396, "eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006594939002146323, "eval_signal/format_reward/centered_abs_mean": 0.03993055565903584, "eval_signal/format_reward/group_std_mean": 0.10589349828660488, "eval_signal/format_reward/group_zero_std_frac": 0.4444444527228673, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.07909915472070377, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.01996527782951792, "eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.03993055565903584, "eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.10589349828660488, "eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.4444444527228673, "eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.015819831596066553, "eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.003993055705601971, "eval_steps_per_second": 0.029, "step": 100 }, { "epoch": 0.23999700003749952, "step": 100, "train_probe_completions/clipped_ratio": 0.025868055555555585, "train_probe_completions/max_length": 2853.0, "train_probe_completions/max_terminated_length": 2853.0, "train_probe_completions/mean_length": 692.7083638509115, "train_probe_completions/mean_terminated_length": 711.0718892415365, "train_probe_completions/min_length": 0.0, "train_probe_completions/min_terminated_length": 184.5, "train_probe_loss": 0.0, "train_probe_num_tokens": 203115215.0, "train_probe_reward": 0.8864819407463074, "train_probe_reward_std": 0.25598659614721936, "train_probe_rewards/accuracy_reward": 0.6631944477558136, "train_probe_rewards/batch_coverage_0": 0.016140021577787895, "train_probe_rewards/brier_reward": 0.7635823786258698, "train_probe_rewards/confidence_uniqueness_reward": 0.8732908368110657, "train_probe_rewards/format_reward": 0.9739583432674408, "train_probe_rewards/frontier_entropy_batch_reward": -0.9739583432674408, "train_probe_runtime": 213.2664, "train_probe_samples_per_second": 4.689, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4340277810891469, "train_probe_signal/accuracy_reward/group_std_mean": 0.4723463257153829, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8704935510953268, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.21701389054457346, "train_probe_signal/advantage_abs_mean": 0.854099969069163, "train_probe_signal/advantage_pre_scale_abs_mean": 0.2182364066441854, "train_probe_signal/advantage_pre_scale_std": 0.2556636830170949, "train_probe_signal/advantage_std": 0.9864280919233958, "train_probe_signal/batch_coverage_0/centered_abs_mean": 0.2105137606461843, "train_probe_signal/batch_coverage_0/group_std_mean": 0.30563366661469143, "train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0, "train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.08499650408824284, "train_probe_signal/batch_coverage_0/weight": 0.10000000149011612, "train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.021051375816265743, "train_probe_signal/brier_reward/centered_abs_mean": 0.21621683984994888, "train_probe_signal/brier_reward/group_std_mean": 0.2767111559708913, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08661133795976639, "train_probe_signal/brier_reward/weight": 0.10000000149011612, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.021621684543788433, "train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.07127842927972476, "train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.12808794404069582, "train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.028270625198880833, "train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007127842943494518, "train_probe_signal/format_reward/centered_abs_mean": 0.04893663106486201, "train_probe_signal/format_reward/group_std_mean": 0.11285140831023455, "train_probe_signal/format_reward/group_zero_std_frac": 0.4722222362955411, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0958357664446036, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.024468315532431006, "train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.04893663106486201, "train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.11285140831023455, "train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.4722222362955411, "train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.019167154173677165, "train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0048936633781219525, "train_probe_steps_per_second": 0.028 }, { "calibration/aurc": 0.32709221037996683, "calibration/batch_distribution_entropy": 0.9072256917534363, "calibration/batch_entropy_100bins": 0.9236496931399495, "calibration/batch_entropy_10bins": 0.9072256917534363, "calibration/batch_entropy_50bins": 0.927115252359181, "calibration/batch_uniqueness": 0.9367734004126167, "calibration/buffer_distribution_entropy": 0.9319828898045401, "calibration/buffer_entropy_100bins": 0.9473722336840288, "calibration/buffer_entropy_10bins": 0.9319828898045401, "calibration/buffer_entropy_50bins": 0.9516387033787922, "calibration/confidence_entropy": 0.4495748465857103, "calibration/coverage@0%": 0.033092714381499924, "calibration/coverage@1%": 0.033092714381499924, "calibration/coverage@10%": 0.15477837853260484, "calibration/coverage@15%": 0.17701470819007947, "calibration/coverage@20%": 0.2013024808479678, "calibration/coverage@25%": 0.23189123197141104, "calibration/coverage@30%": 0.504432124883315, "calibration/coverage@5%": 0.12432486411686663, "calibration/distribution_entropy_10": 0.9072256917534363, "calibration/distribution_entropy_100": 0.9236496931399495, "calibration/ece": 0.15953709471533112, "calibration/mean_confidence": 0.6542690738937268, "calibration/unique_confidence_per_question": 0.9598958333333332, "calibration/unique_confidences": 368.6, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030208333333333327, "completions/max_length": 3699.2, "completions/max_terminated_length": 3699.2, "completions/mean_length": 666.5580078125, "completions/mean_terminated_length": 687.2699340820312, "completions/min_length": 0.0, "completions/min_terminated_length": 151.6, "epoch": 0.2519968500393745, "grad_norm": 0.07229447364807129, "learning_rate": 3.1024096385542172e-06, "loss": -0.0794, "num_tokens": 213870827.0, "reward": 0.9622564077377319, "reward_std": 0.16706106960773467, "rewards/accuracy_reward": 0.6428819298744202, "rewards/batch_coverage_0": 0.26482947170734406, "rewards/brier_reward": 0.7719120621681214, "rewards/confidence_uniqueness_reward": 0.9088397026062012, "rewards/format_reward": 0.9683159708976745, "rewards/frontier_entropy_batch_reward": -0.3790066301822662, "signal/accuracy_reward/centered_abs_mean": 0.16609157919883727, "signal/accuracy_reward/group_std_mean": 0.22139038443565368, "signal/accuracy_reward/group_zero_std_frac": 0.36666667461395264, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9980636358261108, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08304578959941863, "signal/advantage_abs_mean": 0.7221752285957337, "signal/advantage_pre_scale_abs_mean": 0.12088042944669723, "signal/advantage_pre_scale_std": 0.1985163629055023, "signal/advantage_std": 0.983276081085205, "signal/batch_coverage_0/centered_abs_mean": 0.15856701135635376, "signal/batch_coverage_0/group_std_mean": 0.20437993705272675, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.18988114297389985, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015856701508164407, "signal/brier_reward/centered_abs_mean": 0.16616459488868712, "signal/brier_reward/group_std_mean": 0.21281179487705232, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19939744472503662, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.016616460122168063, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.06404128819704055, "signal/confidence_uniqueness_reward/group_std_mean": 0.10072248876094818, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.07632052302360534, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006404128670692444, "signal/format_reward/centered_abs_mean": 0.05047200433909893, "signal/format_reward/group_std_mean": 0.08542852699756623, "signal/format_reward/group_zero_std_frac": 0.6861111164093018, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.29700973331928254, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.025236002169549465, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3599489688873291, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4289889752864838, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43422353863716123, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03599489852786064, "step": 105 }, { "calibration/aurc": 0.1445495184896525, "calibration/batch_distribution_entropy": 0.9549703359188151, "calibration/batch_entropy_100bins": 0.9496826389779457, "calibration/batch_entropy_10bins": 0.9549703359188151, "calibration/batch_entropy_50bins": 0.9596300438920267, "calibration/batch_uniqueness": 0.9464544686187354, "calibration/buffer_distribution_entropy": 0.9319959695872686, "calibration/buffer_entropy_100bins": 0.9489525242674711, "calibration/buffer_entropy_10bins": 0.9319959695872686, "calibration/buffer_entropy_50bins": 0.9523803251403093, "calibration/confidence_entropy": 0.4915952441003196, "calibration/coverage@0%": 0.04353477677542926, "calibration/coverage@1%": 0.04353477677542926, "calibration/coverage@10%": 0.42479945249786555, "calibration/coverage@15%": 0.575662717001929, "calibration/coverage@20%": 0.7581317016685564, "calibration/coverage@25%": 0.8631562383772495, "calibration/coverage@30%": 0.9326810288101164, "calibration/coverage@5%": 0.19128571148926302, "calibration/distribution_entropy_10": 0.9549703359188151, "calibration/distribution_entropy_100": 0.9496826389779457, "calibration/ece": 0.1360942299199772, "calibration/mean_confidence": 0.5788540314486034, "calibration/unique_confidence_per_question": 0.984375, "calibration/unique_confidences": 378.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.020225694444444442, "completions/max_length": 3580.6, "completions/max_terminated_length": 3580.6, "completions/mean_length": 682.2786499023438, "completions/mean_terminated_length": 696.4890258789062, "completions/min_length": 0.0, "completions/min_terminated_length": 196.4, "epoch": 0.2639967000412495, "grad_norm": 0.032034844160079956, "learning_rate": 2.9518072289156627e-06, "loss": -0.0528, "num_tokens": 224839125.0, "reward": 1.006770396232605, "reward_std": 0.15200525820255278, "rewards/accuracy_reward": 0.6963541746139527, "rewards/batch_coverage_0": 0.24029523730278016, "rewards/brier_reward": 0.7906111955642701, "rewards/confidence_uniqueness_reward": 0.9291220664978027, "rewards/format_reward": 0.9788194417953491, "rewards/frontier_entropy_batch_reward": -0.268192446231842, "signal/accuracy_reward/centered_abs_mean": 0.1638563334941864, "signal/accuracy_reward/group_std_mean": 0.22195173501968385, "signal/accuracy_reward/group_zero_std_frac": 0.35, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8842053294181824, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0819281667470932, "signal/advantage_abs_mean": 0.7184135675430298, "signal/advantage_pre_scale_abs_mean": 0.10958259105682373, "signal/advantage_pre_scale_std": 0.17912587225437165, "signal/advantage_std": 0.9834012746810913, "signal/batch_coverage_0/centered_abs_mean": 0.1551068753004074, "signal/batch_coverage_0/group_std_mean": 0.19941962957382203, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.16745518445968627, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015510687604546547, "signal/brier_reward/centered_abs_mean": 0.14882863759994508, "signal/brier_reward/group_std_mean": 0.1928631156682968, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16044945418834686, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.014882863871753216, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.041838298365473744, "signal/confidence_uniqueness_reward/group_std_mean": 0.06950674280524254, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04490874633193016, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004183829948306084, "signal/format_reward/centered_abs_mean": 0.03174913227558136, "signal/format_reward/group_std_mean": 0.057792513817548755, "signal/format_reward/group_zero_std_frac": 0.7666666746139527, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.16912963092327118, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01587456613779068, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.31741302013397216, "signal/frontier_entropy_batch_reward/group_std_mean": 0.3890170991420746, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.3446439206600189, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03174130246043205, "step": 110 }, { "calibration/aurc": 0.24997302512551908, "calibration/batch_distribution_entropy": 0.948771182454743, "calibration/batch_entropy_100bins": 0.9441092968976221, "calibration/batch_entropy_10bins": 0.948771182454743, "calibration/batch_entropy_50bins": 0.955403885102182, "calibration/batch_uniqueness": 0.9460239538764702, "calibration/buffer_distribution_entropy": 0.936164731264576, "calibration/buffer_entropy_100bins": 0.9522699329038218, "calibration/buffer_entropy_10bins": 0.936164731264576, "calibration/buffer_entropy_50bins": 0.9554519538114729, "calibration/confidence_entropy": 0.4640606387456156, "calibration/coverage@0%": 0.01047088844822663, "calibration/coverage@1%": 0.01047088844822663, "calibration/coverage@10%": 0.09750734469069952, "calibration/coverage@15%": 0.3214186635337727, "calibration/coverage@20%": 0.4891239852148862, "calibration/coverage@25%": 0.5843370164004027, "calibration/coverage@30%": 0.6630702631818101, "calibration/coverage@5%": 0.033055108306382655, "calibration/distribution_entropy_10": 0.948771182454743, "calibration/distribution_entropy_100": 0.9441092968976221, "calibration/ece": 0.1190749094862839, "calibration/mean_confidence": 0.5852567080340979, "calibration/unique_confidence_per_question": 0.9880208333333332, "calibration/unique_confidences": 379.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02152777777777779, "completions/max_length": 3657.2, "completions/max_terminated_length": 3657.2, "completions/mean_length": 685.8245727539063, "completions/mean_terminated_length": 701.0185913085937, "completions/min_length": 0.0, "completions/min_terminated_length": 190.6, "epoch": 0.27599655004312446, "grad_norm": 0.01640477403998375, "learning_rate": 2.8012048192771087e-06, "loss": -0.0579, "num_tokens": 235819024.0, "reward": 0.9847165465354919, "reward_std": 0.15014655888080597, "rewards/accuracy_reward": 0.6589409708976746, "rewards/batch_coverage_0": 0.27372472286224364, "rewards/brier_reward": 0.7927528858184815, "rewards/confidence_uniqueness_reward": 0.9244337797164917, "rewards/format_reward": 0.9782986044883728, "rewards/frontier_entropy_batch_reward": -0.3299441754817963, "signal/accuracy_reward/centered_abs_mean": 0.16461046040058136, "signal/accuracy_reward/group_std_mean": 0.21322064697742463, "signal/accuracy_reward/group_zero_std_frac": 0.4, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0106230854988099, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08230523020029068, "signal/advantage_abs_mean": 0.7277908205986023, "signal/advantage_pre_scale_abs_mean": 0.11005394756793976, "signal/advantage_pre_scale_std": 0.17997059226036072, "signal/advantage_std": 0.9832545518875122, "signal/batch_coverage_0/centered_abs_mean": 0.16282705068588257, "signal/batch_coverage_0/group_std_mean": 0.2089460790157318, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.19948122501373292, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.0162827055901289, "signal/brier_reward/centered_abs_mean": 0.15371213257312774, "signal/brier_reward/group_std_mean": 0.197787806391716, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18880485296249389, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.015371213294565677, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.046393326669931415, "signal/confidence_uniqueness_reward/group_std_mean": 0.07716395705938339, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05672838613390922, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004639332601800561, "signal/format_reward/centered_abs_mean": 0.03493923582136631, "signal/format_reward/group_std_mean": 0.06415542513132096, "signal/format_reward/group_zero_std_frac": 0.7416666507720947, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.21280283629894256, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.017469617910683154, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3413366138935089, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4106238782405853, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41952593326568605, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034133663028478624, "step": 115 }, { "calibration/aurc": 0.27797711120421953, "calibration/batch_distribution_entropy": 0.9583379741927558, "calibration/batch_entropy_100bins": 0.9493557397434813, "calibration/batch_entropy_10bins": 0.9583379741927558, "calibration/batch_entropy_50bins": 0.9613567988695655, "calibration/batch_uniqueness": 0.9479972252470749, "calibration/buffer_distribution_entropy": 0.9377245142123376, "calibration/buffer_entropy_100bins": 0.9542313286000972, "calibration/buffer_entropy_10bins": 0.9377245142123376, "calibration/buffer_entropy_50bins": 0.9569445223722793, "calibration/confidence_entropy": 0.45816828200534754, "calibration/coverage@0%": 0.025490337368947603, "calibration/coverage@1%": 0.025490337368947603, "calibration/coverage@10%": 0.20452225123288423, "calibration/coverage@15%": 0.2787226282083809, "calibration/coverage@20%": 0.5025003751481274, "calibration/coverage@25%": 0.5424779528109738, "calibration/coverage@30%": 0.5757674726677722, "calibration/coverage@5%": 0.07545553963080058, "calibration/distribution_entropy_10": 0.9583379741927558, "calibration/distribution_entropy_100": 0.9493557397434813, "calibration/ece": 0.14685373933054077, "calibration/mean_confidence": 0.5651412098927935, "calibration/unique_confidence_per_question": 0.9776041666666666, "calibration/unique_confidences": 375.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01961805555555556, "completions/max_length": 3235.8, "completions/max_terminated_length": 3235.8, "completions/mean_length": 642.6586059570312, "completions/mean_terminated_length": 655.4541381835937, "completions/min_length": 0.0, "completions/min_terminated_length": 154.8, "epoch": 0.28799640004499943, "grad_norm": 0.015719039365649223, "learning_rate": 2.6506024096385547e-06, "loss": -0.055, "num_tokens": 246304307.0, "reward": 0.9861522912979126, "reward_std": 0.14733969867229463, "rewards/accuracy_reward": 0.6541666746139526, "rewards/batch_coverage_0": 0.28446308374404905, "rewards/brier_reward": 0.7799728155136109, "rewards/confidence_uniqueness_reward": 0.927653694152832, "rewards/format_reward": 0.9799479246139526, "rewards/frontier_entropy_batch_reward": -0.30113983154296875, "signal/accuracy_reward/centered_abs_mean": 0.16872830241918563, "signal/accuracy_reward/group_std_mean": 0.21660387516021729, "signal/accuracy_reward/group_zero_std_frac": 0.4000000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0460703372955322, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08436415120959281, "signal/advantage_abs_mean": 0.7369883894920349, "signal/advantage_pre_scale_abs_mean": 0.10920974463224412, "signal/advantage_pre_scale_std": 0.1773868203163147, "signal/advantage_std": 0.9832382082939148, "signal/batch_coverage_0/centered_abs_mean": 0.17697641849517823, "signal/batch_coverage_0/group_std_mean": 0.22625457048416137, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2210148811340332, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.017697642371058465, "signal/brier_reward/centered_abs_mean": 0.15970203280448914, "signal/brier_reward/group_std_mean": 0.20490642786026, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19861160218715668, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.015970203652977945, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.04404056295752525, "signal/confidence_uniqueness_reward/group_std_mean": 0.07342168614268303, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.055048702657222746, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004404056537896395, "signal/format_reward/centered_abs_mean": 0.0325575090944767, "signal/format_reward/group_std_mean": 0.06021936684846878, "signal/format_reward/group_zero_std_frac": 0.7555555701255798, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.20300004184246062, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.01627875454723835, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3320270121097565, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4021295964717865, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4149349987506866, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03320269919931888, "step": 120 }, { "calibration/aurc": 0.1703143158395383, "calibration/batch_distribution_entropy": 0.8794585073573937, "calibration/batch_entropy_100bins": 0.9106494560303057, "calibration/batch_entropy_10bins": 0.8794585073573937, "calibration/batch_entropy_50bins": 0.9141756705557457, "calibration/batch_uniqueness": 0.9305113696453565, "calibration/buffer_distribution_entropy": 0.9404959265968345, "calibration/buffer_entropy_100bins": 0.9565428692936477, "calibration/buffer_entropy_10bins": 0.9404959265968345, "calibration/buffer_entropy_50bins": 0.9589872588913677, "calibration/confidence_entropy": 0.4253087345674131, "calibration/coverage@0%": 0.09905896770438485, "calibration/coverage@1%": 0.15895480103771814, "calibration/coverage@10%": 0.4012652997253823, "calibration/coverage@15%": 0.4946182213996466, "calibration/coverage@20%": 0.5521577999911764, "calibration/coverage@25%": 0.7789902226812445, "calibration/coverage@30%": 0.871652319279875, "calibration/coverage@5%": 0.3014460083867995, "calibration/distribution_entropy_10": 0.8794585073573937, "calibration/distribution_entropy_100": 0.9106494560303057, "calibration/ece": 0.16335647487649493, "calibration/mean_confidence": 0.6267505431558249, "calibration/unique_confidence_per_question": 0.990625, "calibration/unique_confidences": 380.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011631944444444464, "completions/max_length": 3299.4, "completions/max_terminated_length": 3299.4, "completions/mean_length": 625.4529663085938, "completions/mean_terminated_length": 632.8467529296875, "completions/min_length": 0.0, "completions/min_terminated_length": 126.4, "epoch": 0.2999962500468744, "grad_norm": 0.016347240656614304, "learning_rate": 2.5e-06, "loss": -0.0321, "num_tokens": 256627189.0, "reward": 1.0023396015167236, "reward_std": 0.1366099238395691, "rewards/accuracy_reward": 0.6794270873069763, "rewards/batch_coverage_0": 0.3166307270526886, "rewards/brier_reward": 0.8103621363639831, "rewards/confidence_uniqueness_reward": 0.9277669072151185, "rewards/format_reward": 0.9872395753860473, "rewards/frontier_entropy_batch_reward": -0.3646974146366119, "signal/accuracy_reward/centered_abs_mean": 0.16190863847732545, "signal/accuracy_reward/group_std_mean": 0.21255030035972594, "signal/accuracy_reward/group_zero_std_frac": 0.4083333373069763, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.1290634512901305, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.08095431923866273, "signal/advantage_abs_mean": 0.7284621834754944, "signal/advantage_pre_scale_abs_mean": 0.09951311200857163, "signal/advantage_pre_scale_std": 0.16377390921115875, "signal/advantage_std": 0.9830830574035645, "signal/batch_coverage_0/centered_abs_mean": 0.16607835292816162, "signal/batch_coverage_0/group_std_mean": 0.21322621107101442, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23173038959503173, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.016607835702598096, "signal/brier_reward/centered_abs_mean": 0.14223978221416472, "signal/brier_reward/group_std_mean": 0.1856682389974594, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.19865721762180327, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.01422397829592228, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.036846403032541275, "signal/confidence_uniqueness_reward/group_std_mean": 0.06017006188631058, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.05238675698637962, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0036846403498202564, "signal/format_reward/centered_abs_mean": 0.02101236991584301, "signal/format_reward/group_std_mean": 0.041347318515181544, "signal/format_reward/group_zero_std_frac": 0.8222222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.14590775668621064, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010506184957921504, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33323261737823484, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40458549857139586, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47181462645530703, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03332326300442219, "step": 125 }, { "calibration/aurc": 0.2300665525636191, "calibration/batch_distribution_entropy": 0.9289155334598537, "calibration/batch_entropy_100bins": 0.9339730823409967, "calibration/batch_entropy_10bins": 0.9289155334598537, "calibration/batch_entropy_50bins": 0.9422619695279744, "calibration/batch_uniqueness": 0.9426163443601956, "calibration/buffer_distribution_entropy": 0.9394723763941414, "calibration/buffer_entropy_100bins": 0.9569623231573903, "calibration/buffer_entropy_10bins": 0.9394723763941414, "calibration/buffer_entropy_50bins": 0.9588223705223482, "calibration/confidence_entropy": 0.44351004871113436, "calibration/coverage@0%": 0.0154343832961903, "calibration/coverage@1%": 0.0154343832961903, "calibration/coverage@10%": 0.20882918884792864, "calibration/coverage@15%": 0.3081636308958342, "calibration/coverage@20%": 0.38748598522643307, "calibration/coverage@25%": 0.5834554325420772, "calibration/coverage@30%": 0.8136605973157437, "calibration/coverage@5%": 0.1259290743318735, "calibration/distribution_entropy_10": 0.9289155334598537, "calibration/distribution_entropy_100": 0.9339730823409967, "calibration/ece": 0.14460488876040145, "calibration/mean_confidence": 0.5764310270988868, "calibration/unique_confidence_per_question": 0.9838541666666668, "calibration/unique_confidences": 377.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008854166666666653, "completions/max_length": 2783.2, "completions/max_terminated_length": 2783.2, "completions/mean_length": 621.380224609375, "completions/mean_terminated_length": 627.0984619140625, "completions/min_length": 0.0, "completions/min_terminated_length": 166.4, "epoch": 0.3119961000487494, "grad_norm": 0.006308301351964474, "learning_rate": 2.349397590361446e-06, "loss": -0.0272, "num_tokens": 266910289.0, "reward": 0.9967981100082397, "reward_std": 0.13324756622314454, "rewards/accuracy_reward": 0.6637152791023254, "rewards/batch_coverage_0": 0.2850965529680252, "rewards/brier_reward": 0.8013586163520813, "rewards/confidence_uniqueness_reward": 0.9356141209602356, "rewards/format_reward": 0.9899305462837219, "rewards/frontier_entropy_batch_reward": -0.32231796681880953, "signal/accuracy_reward/centered_abs_mean": 0.1640516459941864, "signal/accuracy_reward/group_std_mean": 0.21628546714782715, "signal/accuracy_reward/group_zero_std_frac": 0.38055555820465087, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0400630950927734, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0820258229970932, "signal/advantage_abs_mean": 0.7400351524353027, "signal/advantage_pre_scale_abs_mean": 0.09946749657392502, "signal/advantage_pre_scale_std": 0.1557171642780304, "signal/advantage_std": 0.9832014203071594, "signal/batch_coverage_0/centered_abs_mean": 0.15798864066600798, "signal/batch_coverage_0/group_std_mean": 0.20295707881450653, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.20411037504673005, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.01579886469990015, "signal/brier_reward/centered_abs_mean": 0.1348998188972473, "signal/brier_reward/group_std_mean": 0.1756774067878723, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1731540232896805, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.01348998285830021, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.031015625223517418, "signal/confidence_uniqueness_reward/group_std_mean": 0.04782265685498714, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04015704616904259, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003101562592200935, "signal/format_reward/centered_abs_mean": 0.016655815846752374, "signal/format_reward/group_std_mean": 0.03051824141293764, "signal/format_reward/group_zero_std_frac": 0.8777777791023255, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.10784837445244193, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.008327907923376187, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33705700635910035, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4063593685626984, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43399363160133364, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03370570130646229, "step": 130 }, { "calibration/aurc": 0.2010499629419042, "calibration/batch_distribution_entropy": 0.9453438710641814, "calibration/batch_entropy_100bins": 0.9417344096389343, "calibration/batch_entropy_10bins": 0.9453438710641814, "calibration/batch_entropy_50bins": 0.9516687021452327, "calibration/batch_uniqueness": 0.9439579691190396, "calibration/buffer_distribution_entropy": 0.9419838075350185, "calibration/buffer_entropy_100bins": 0.9589754047081909, "calibration/buffer_entropy_10bins": 0.9419838075350185, "calibration/buffer_entropy_50bins": 0.96063677512858, "calibration/confidence_entropy": 0.46578609429290746, "calibration/coverage@0%": 0.20367177080733212, "calibration/coverage@1%": 0.21411563503709713, "calibration/coverage@10%": 0.3944984289031729, "calibration/coverage@15%": 0.4422044108251454, "calibration/coverage@20%": 0.5067000551427593, "calibration/coverage@25%": 0.5444308952584296, "calibration/coverage@30%": 0.6695770806183983, "calibration/coverage@5%": 0.32690936871855925, "calibration/distribution_entropy_10": 0.9453438710641814, "calibration/distribution_entropy_100": 0.9417344096389343, "calibration/ece": 0.179603961910554, "calibration/mean_confidence": 0.5899262525496647, "calibration/unique_confidence_per_question": 0.9963541666666668, "calibration/unique_confidences": 382.6, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006076388888888862, "completions/max_length": 3393.2, "completions/max_terminated_length": 3393.2, "completions/mean_length": 594.621435546875, "completions/mean_terminated_length": 598.2032958984375, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.32399595005062437, "grad_norm": 0.010317265056073666, "learning_rate": 2.1987951807228917e-06, "loss": -0.0126, "num_tokens": 276853352.0, "reward": 1.0097585678100587, "reward_std": 0.12289493680000305, "rewards/accuracy_reward": 0.6751736044883728, "rewards/batch_coverage_0": 0.2959615230560303, "rewards/brier_reward": 0.8135364770889282, "rewards/confidence_uniqueness_reward": 0.9416105151176453, "rewards/format_reward": 0.99375, "rewards/frontier_entropy_batch_reward": -0.2981413632631302, "signal/accuracy_reward/centered_abs_mean": 0.15569661557674408, "signal/accuracy_reward/group_std_mean": 0.20491576492786406, "signal/accuracy_reward/group_zero_std_frac": 0.4138888895511627, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0529741644859314, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07784830778837204, "signal/advantage_abs_mean": 0.7428248643875122, "signal/advantage_pre_scale_abs_mean": 0.09276586472988128, "signal/advantage_pre_scale_std": 0.1466542512178421, "signal/advantage_std": 0.9831327557563782, "signal/batch_coverage_0/centered_abs_mean": 0.15543493628501892, "signal/batch_coverage_0/group_std_mean": 0.19824867248535155, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21039265990257264, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015543493255972863, "signal/brier_reward/centered_abs_mean": 0.12734594643115998, "signal/brier_reward/group_std_mean": 0.1648596316576004, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17233424186706542, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.012734594009816646, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.02444492354989052, "signal/confidence_uniqueness_reward/group_std_mean": 0.037316303700208664, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03319260329008102, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0024444923736155035, "signal/format_reward/centered_abs_mean": 0.01032986119389534, "signal/format_reward/group_std_mean": 0.020031702518463135, "signal/format_reward/group_zero_std_frac": 0.9138889074325561, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.07024011090397835, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00516493059694767, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3326454758644104, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40271806716918945, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4504189193248749, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03326454721391201, "step": 135 }, { "calibration/aurc": 0.11405225671003227, "calibration/batch_distribution_entropy": 0.9285432797078601, "calibration/batch_entropy_100bins": 0.9358449348977624, "calibration/batch_entropy_10bins": 0.9285432797078601, "calibration/batch_entropy_50bins": 0.9439536592451411, "calibration/batch_uniqueness": 0.9417474476857131, "calibration/buffer_distribution_entropy": 0.9481680561778119, "calibration/buffer_entropy_100bins": 0.9653146440201216, "calibration/buffer_entropy_10bins": 0.9481680561778119, "calibration/buffer_entropy_50bins": 0.9656940039647847, "calibration/confidence_entropy": 0.4419780302764675, "calibration/coverage@0%": 0.022544080303125238, "calibration/coverage@1%": 0.07241284670732472, "calibration/coverage@10%": 0.610283779597179, "calibration/coverage@15%": 0.7543212120658326, "calibration/coverage@20%": 0.8502777944575561, "calibration/coverage@25%": 0.9230536656641395, "calibration/coverage@30%": 0.9827837926509186, "calibration/coverage@5%": 0.31054481087742014, "calibration/distribution_entropy_10": 0.9285432797078601, "calibration/distribution_entropy_100": 0.9358449348977624, "calibration/ece": 0.12129232013378737, "calibration/mean_confidence": 0.6215515878447262, "calibration/unique_confidence_per_question": 0.9921875, "calibration/unique_confidences": 381.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00972222222222221, "completions/max_length": 3051.4, "completions/max_terminated_length": 3051.4, "completions/mean_length": 600.0604248046875, "completions/mean_terminated_length": 605.9631225585938, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.33599580005249935, "grad_norm": 0.004826192278414965, "learning_rate": 2.0481927710843377e-06, "loss": -0.0332, "num_tokens": 286870272.0, "reward": 1.0001838564872743, "reward_std": 0.1291646659374237, "rewards/accuracy_reward": 0.6635416626930237, "rewards/batch_coverage_0": 0.3250649034976959, "rewards/brier_reward": 0.8216192960739136, "rewards/confidence_uniqueness_reward": 0.9341175317764282, "rewards/format_reward": 0.9896701335906982, "rewards/frontier_entropy_batch_reward": -0.3450227200984955, "signal/accuracy_reward/centered_abs_mean": 0.14829643964767455, "signal/accuracy_reward/group_std_mean": 0.1988508701324463, "signal/accuracy_reward/group_zero_std_frac": 0.42222222685813904, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0239773392677307, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07414821982383728, "signal/advantage_abs_mean": 0.7267651915550232, "signal/advantage_pre_scale_abs_mean": 0.09410272091627121, "signal/advantage_pre_scale_std": 0.1546729475259781, "signal/advantage_std": 0.9830945611000061, "signal/batch_coverage_0/centered_abs_mean": 0.15927667319774627, "signal/batch_coverage_0/group_std_mean": 0.20444611310958863, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22108587622642517, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015927667170763014, "signal/brier_reward/centered_abs_mean": 0.13157424628734588, "signal/brier_reward/group_std_mean": 0.17232659459114075, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18126892745494844, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.01315742488950491, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.03295594230294228, "signal/confidence_uniqueness_reward/group_std_mean": 0.055471654236316684, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.044615660607814786, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003295594220981002, "signal/format_reward/centered_abs_mean": 0.018386501539498567, "signal/format_reward/group_std_mean": 0.03843978680670261, "signal/format_reward/group_zero_std_frac": 0.8277777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.11741685792803765, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009193250769749283, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3423487484455109, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4122697412967682, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4785375475883484, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034234875440597536, "step": 140 }, { "calibration/aurc": 0.17159364528919666, "calibration/batch_distribution_entropy": 0.9739341873668828, "calibration/batch_entropy_100bins": 0.9591711536611062, "calibration/batch_entropy_10bins": 0.9739341873668828, "calibration/batch_entropy_50bins": 0.9695936724147591, "calibration/batch_uniqueness": 0.9512091916551567, "calibration/buffer_distribution_entropy": 0.9587097224168127, "calibration/buffer_entropy_100bins": 0.9746741480605607, "calibration/buffer_entropy_10bins": 0.9587097224168127, "calibration/buffer_entropy_50bins": 0.9736304805691599, "calibration/confidence_entropy": 0.46751717394040365, "calibration/coverage@0%": 0.026225070786173317, "calibration/coverage@1%": 0.026225070786173317, "calibration/coverage@10%": 0.32045087218771207, "calibration/coverage@15%": 0.5025195465554969, "calibration/coverage@20%": 0.6570699821959377, "calibration/coverage@25%": 0.7701132983193975, "calibration/coverage@30%": 0.875789496312478, "calibration/coverage@5%": 0.09018677412997969, "calibration/distribution_entropy_10": 0.9739341873668828, "calibration/distribution_entropy_100": 0.9591711536611062, "calibration/ece": 0.15216885313654238, "calibration/mean_confidence": 0.5343308437468959, "calibration/unique_confidence_per_question": 0.9942708333333334, "calibration/unique_confidences": 381.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009722222222222188, "completions/max_length": 3167.2, "completions/max_terminated_length": 3167.2, "completions/mean_length": 608.954248046875, "completions/mean_terminated_length": 614.9598876953125, "completions/min_length": 0.0, "completions/min_terminated_length": 189.0, "epoch": 0.34799565005437433, "grad_norm": 0.0030671849381178617, "learning_rate": 1.8975903614457832e-06, "loss": -0.0267, "num_tokens": 296950033.0, "reward": 1.0304441452026367, "reward_std": 0.1264051616191864, "rewards/accuracy_reward": 0.7199652671813965, "rewards/batch_coverage_0": 0.29423797130584717, "rewards/brier_reward": 0.8002641916275024, "rewards/confidence_uniqueness_reward": 0.9402836084365844, "rewards/format_reward": 0.9901041507720947, "rewards/frontier_entropy_batch_reward": -0.28069123029708865, "signal/accuracy_reward/centered_abs_mean": 0.1349392354488373, "signal/accuracy_reward/group_std_mean": 0.19156003892421722, "signal/accuracy_reward/group_zero_std_frac": 0.4055555582046509, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8481933355331421, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06746961772441865, "signal/advantage_abs_mean": 0.7143575072288513, "signal/advantage_pre_scale_abs_mean": 0.08871349543333054, "signal/advantage_pre_scale_std": 0.14817543923854828, "signal/advantage_std": 0.9832207202911377, "signal/batch_coverage_0/centered_abs_mean": 0.15694086849689484, "signal/batch_coverage_0/group_std_mean": 0.1989602953195572, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.19811130166053773, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015694086998701097, "signal/brier_reward/centered_abs_mean": 0.13146734237670898, "signal/brier_reward/group_std_mean": 0.173081836104393, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16585685014724733, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.013146734982728957, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.028389297798275946, "signal/confidence_uniqueness_reward/group_std_mean": 0.047778960317373276, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.035612285137176514, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002838929882273078, "signal/format_reward/centered_abs_mean": 0.016731770522892476, "signal/format_reward/group_std_mean": 0.033717308193445206, "signal/format_reward/group_zero_std_frac": 0.8527777791023254, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.104011070728302, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.008365885261446238, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.329537171125412, "signal/frontier_entropy_batch_reward/group_std_mean": 0.39947680830955506, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.41544281244277953, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.032953718304634096, "step": 145 }, { "calibration/aurc": 0.17155475015271432, "calibration/batch_distribution_entropy": 0.9508950837242326, "calibration/batch_entropy_100bins": 0.9499402094362974, "calibration/batch_entropy_10bins": 0.9508950837242326, "calibration/batch_entropy_50bins": 0.9593933804461509, "calibration/batch_uniqueness": 0.9471473333291687, "calibration/buffer_distribution_entropy": 0.9677614387212963, "calibration/buffer_entropy_100bins": 0.9817337736389484, "calibration/buffer_entropy_10bins": 0.9677614387212963, "calibration/buffer_entropy_50bins": 0.9798977340998449, "calibration/confidence_entropy": 0.445313271491532, "calibration/coverage@0%": 0.07768557054417162, "calibration/coverage@1%": 0.1580005311740929, "calibration/coverage@10%": 0.429632352443371, "calibration/coverage@15%": 0.5252005253547066, "calibration/coverage@20%": 0.5991877003084655, "calibration/coverage@25%": 0.7273865506431193, "calibration/coverage@30%": 0.7858148091855133, "calibration/coverage@5%": 0.3040409427787214, "calibration/distribution_entropy_10": 0.9508950837242326, "calibration/distribution_entropy_100": 0.9499402094362974, "calibration/ece": 0.16646348737058445, "calibration/mean_confidence": 0.531541467788136, "calibration/unique_confidence_per_question": 0.9880208333333333, "calibration/unique_confidences": 379.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010763888888888884, "completions/max_length": 3309.8, "completions/max_terminated_length": 3309.8, "completions/mean_length": 682.9143310546875, "completions/mean_terminated_length": 690.3553955078125, "completions/min_length": 0.0, "completions/min_terminated_length": 176.6, "epoch": 0.3599955000562493, "grad_norm": 0.004152393434196711, "learning_rate": 1.7469879518072292e-06, "loss": -0.0342, "num_tokens": 307927542.0, "reward": 1.0145195960998534, "reward_std": 0.12927872389554979, "rewards/accuracy_reward": 0.6892361044883728, "rewards/batch_coverage_0": 0.3265324205160141, "rewards/brier_reward": 0.8156694173812866, "rewards/confidence_uniqueness_reward": 0.9351176023483276, "rewards/format_reward": 0.9892361164093018, "rewards/frontier_entropy_batch_reward": -0.32448467910289763, "signal/accuracy_reward/centered_abs_mean": 0.1487847238779068, "signal/accuracy_reward/group_std_mean": 0.19472098350524902, "signal/accuracy_reward/group_zero_std_frac": 0.4472222328186035, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0139795064926147, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0743923619389534, "signal/advantage_abs_mean": 0.731607973575592, "signal/advantage_pre_scale_abs_mean": 0.09385995417833329, "signal/advantage_pre_scale_std": 0.1545901983976364, "signal/advantage_std": 0.9830900192260742, "signal/batch_coverage_0/centered_abs_mean": 0.166290482878685, "signal/batch_coverage_0/group_std_mean": 0.21268681883811952, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23350246846675873, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.016629048623144627, "signal/brier_reward/centered_abs_mean": 0.13412976413965225, "signal/brier_reward/group_std_mean": 0.17652139365673064, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18553486466407776, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.013412976451218129, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.03293977566063404, "signal/confidence_uniqueness_reward/group_std_mean": 0.05978764072060585, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04623020067811012, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003293977631255984, "signal/format_reward/centered_abs_mean": 0.01946614570915699, "signal/format_reward/group_std_mean": 0.044223295897245406, "signal/format_reward/group_zero_std_frac": 0.7888888955116272, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.13444773107767105, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.009733072854578495, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33882314562797544, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40830708146095274, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47709122896194456, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03388231247663498, "step": 150 }, { "epoch": 0.3599955000562493, "eval_completions/clipped_ratio": 0.020833333333333353, "eval_completions/max_length": 2312.6666666666665, "eval_completions/max_terminated_length": 2312.6666666666665, "eval_completions/mean_length": 630.3422037760416, "eval_completions/mean_terminated_length": 643.8012288411459, "eval_completions/min_length": 44.666666666666664, "eval_completions/min_terminated_length": 222.83333333333334, "eval_loss": 0.0, "eval_num_tokens": 307927542.0, "eval_reward": 0.9089752435684204, "eval_reward_std": 0.24734783669312796, "eval_rewards/accuracy_reward": 0.6935763955116272, "eval_rewards/batch_coverage_0": 0.027789496661474306, "eval_rewards/brier_reward": 0.8029025296370188, "eval_rewards/confidence_uniqueness_reward": 0.874511311451594, "eval_rewards/format_reward": 0.9791666766007742, "eval_rewards/frontier_entropy_batch_reward": -0.9791666766007742, "eval_runtime": 187.017, "eval_samples_per_second": 5.347, "eval_signal/accuracy_reward/centered_abs_mean": 0.41259765625, "eval_signal/accuracy_reward/group_std_mean": 0.4600823372602463, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8547090888023376, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.206298828125, "eval_signal/advantage_abs_mean": 0.8410639564196268, "eval_signal/advantage_pre_scale_abs_mean": 0.20800520231326422, "eval_signal/advantage_pre_scale_std": 0.2465300957361857, "eval_signal/advantage_std": 0.9864153365294138, "eval_signal/batch_coverage_0/centered_abs_mean": 0.18049288044373193, "eval_signal/batch_coverage_0/group_std_mean": 0.2696962629755338, "eval_signal/batch_coverage_0/group_zero_std_frac": 0.0, "eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07506928406655788, "eval_signal/batch_coverage_0/weight": 0.10000000149011612, "eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.018049288696298998, "eval_signal/brier_reward/centered_abs_mean": 0.200759785870711, "eval_signal/brier_reward/group_std_mean": 0.2616179163257281, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.0827964333196481, "eval_signal/brier_reward/weight": 0.10000000149011612, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.020075978711247444, "eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.06461226319273312, "eval_signal/confidence_uniqueness_reward/group_std_mean": 0.1164653729647398, "eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02636661970367034, "eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006461226458971699, "eval_signal/format_reward/centered_abs_mean": 0.039605035446584225, "eval_signal/format_reward/group_std_mean": 0.09938833986719449, "eval_signal/format_reward/group_zero_std_frac": 0.5000000049670538, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.07894899075229962, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.019802517723292112, "eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.039605035446584225, "eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.09938833986719449, "eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.5000000049670538, "eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.01578979877134164, "eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.00396050369211783, "eval_steps_per_second": 0.032, "step": 150 }, { "epoch": 0.3599955000562493, "step": 150, "train_probe_completions/clipped_ratio": 0.011284722222222229, "train_probe_completions/max_length": 2123.1666666666665, "train_probe_completions/max_terminated_length": 2123.1666666666665, "train_probe_completions/mean_length": 653.1609598795573, "train_probe_completions/mean_terminated_length": 660.668711344401, "train_probe_completions/min_length": 35.833333333333336, "train_probe_completions/min_terminated_length": 181.83333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 307927542.0, "train_probe_reward": 0.9152792493502299, "train_probe_reward_std": 0.23713775972525278, "train_probe_rewards/accuracy_reward": 0.7005208333333334, "train_probe_rewards/batch_coverage_0": 0.02929902666558822, "train_probe_rewards/brier_reward": 0.8055415451526642, "train_probe_rewards/confidence_uniqueness_reward": 0.8778475622336069, "train_probe_rewards/format_reward": 0.984375, "train_probe_rewards/frontier_entropy_batch_reward": -0.984375, "train_probe_runtime": 204.3154, "train_probe_samples_per_second": 4.894, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.4099934895833333, "train_probe_signal/accuracy_reward/group_std_mean": 0.45888521273930866, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8822846511999766, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.20499674479166666, "train_probe_signal/advantage_abs_mean": 0.8449337383111318, "train_probe_signal/advantage_pre_scale_abs_mean": 0.20016596963008246, "train_probe_signal/advantage_pre_scale_std": 0.23646700382232666, "train_probe_signal/advantage_std": 0.9863986372947693, "train_probe_signal/batch_coverage_0/centered_abs_mean": 0.18984342366456985, "train_probe_signal/batch_coverage_0/group_std_mean": 0.27815695852041245, "train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0, "train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.08140804183979829, "train_probe_signal/batch_coverage_0/weight": 0.10000000149011612, "train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.018984342149148386, "train_probe_signal/brier_reward/centered_abs_mean": 0.1932607168952624, "train_probe_signal/brier_reward/group_std_mean": 0.25661252935727435, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.08301879713932674, "train_probe_signal/brier_reward/weight": 0.10000000149011612, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.01932607094446818, "train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.06211579963564873, "train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.10437597334384918, "train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.026644666989644367, "train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.006211580087741216, "train_probe_signal/format_reward/centered_abs_mean": 0.02983940951526165, "train_probe_signal/format_reward/group_std_mean": 0.07643071375787258, "train_probe_signal/format_reward/group_zero_std_frac": 0.6111111293236414, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.06301099962244432, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.014919704757630825, "train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.02983940951526165, "train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.07643071375787258, "train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.6111111293236414, "train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.012602200809245309, "train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0029839409980922937, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.13047708267607877, "calibration/batch_distribution_entropy": 0.9780475321298369, "calibration/batch_entropy_100bins": 0.9625704525202989, "calibration/batch_entropy_10bins": 0.9780475321298369, "calibration/batch_entropy_50bins": 0.9725882234384929, "calibration/batch_uniqueness": 0.9517862803241908, "calibration/buffer_distribution_entropy": 0.9735897148741166, "calibration/buffer_entropy_100bins": 0.9859773805939941, "calibration/buffer_entropy_10bins": 0.9735897148741166, "calibration/buffer_entropy_50bins": 0.9837481165719518, "calibration/confidence_entropy": 0.48740500431683287, "calibration/coverage@0%": 0.04358208749052296, "calibration/coverage@1%": 0.09358208749052296, "calibration/coverage@10%": 0.4876644552477252, "calibration/coverage@15%": 0.5870458527081308, "calibration/coverage@20%": 0.7345223454503469, "calibration/coverage@25%": 0.8802459049707746, "calibration/coverage@30%": 0.9559009361150557, "calibration/coverage@5%": 0.3262937880016433, "calibration/distribution_entropy_10": 0.9780475321298369, "calibration/distribution_entropy_100": 0.9625704525202989, "calibration/ece": 0.21096636475852878, "calibration/mean_confidence": 0.5727649671475417, "calibration/unique_confidence_per_question": 0.9890625, "calibration/unique_confidences": 379.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.019357638888888883, "completions/max_length": 3017.6, "completions/max_terminated_length": 3017.6, "completions/mean_length": 637.5643188476563, "completions/mean_terminated_length": 650.1926635742187, "completions/min_length": 0.0, "completions/min_terminated_length": 190.6, "epoch": 0.3719953500581243, "grad_norm": 0.0035331202670931816, "learning_rate": 1.5963855421686747e-06, "loss": -0.0546, "num_tokens": 318379995.0, "reward": 1.0356850147247314, "reward_std": 0.14273865818977355, "rewards/accuracy_reward": 0.7408854126930237, "rewards/batch_coverage_0": 0.3165908634662628, "rewards/brier_reward": 0.8121230244636536, "rewards/confidence_uniqueness_reward": 0.9271683931350708, "rewards/format_reward": 0.9801215171813965, "rewards/frontier_entropy_batch_reward": -0.3040671467781067, "signal/accuracy_reward/centered_abs_mean": 0.15713433027267457, "signal/accuracy_reward/group_std_mean": 0.2059522569179535, "signal/accuracy_reward/group_zero_std_frac": 0.42222222685813904, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0711824655532838, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07856716513633728, "signal/advantage_abs_mean": 0.7309226036071778, "signal/advantage_pre_scale_abs_mean": 0.10310240536928177, "signal/advantage_pre_scale_std": 0.17670319974422455, "signal/advantage_std": 0.9831022620201111, "signal/batch_coverage_0/centered_abs_mean": 0.16185049712657928, "signal/batch_coverage_0/group_std_mean": 0.20479914247989656, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2245582729578018, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.016185050085186958, "signal/brier_reward/centered_abs_mean": 0.13712520897388458, "signal/brier_reward/group_std_mean": 0.17701683342456817, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18834921419620515, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.013712521269917488, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.04509064331650734, "signal/confidence_uniqueness_reward/group_std_mean": 0.07520343959331513, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.06165208369493484, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0045090644154697655, "signal/format_reward/centered_abs_mean": 0.03320855014026165, "signal/format_reward/group_std_mean": 0.061374055594205855, "signal/format_reward/group_zero_std_frac": 0.7527777910232544, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.22536550164222718, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.016604275070130826, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33104270696640015, "signal/frontier_entropy_batch_reward/group_std_mean": 0.39966055750846863, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4580074489116669, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033104270696640015, "step": 155 }, { "calibration/aurc": 0.0984045730686661, "calibration/batch_distribution_entropy": 0.8756279425815006, "calibration/batch_entropy_100bins": 0.9106554886391262, "calibration/batch_entropy_10bins": 0.8756279425815006, "calibration/batch_entropy_50bins": 0.9137960166292582, "calibration/batch_uniqueness": 0.9327047283052717, "calibration/buffer_distribution_entropy": 0.9735163551188186, "calibration/buffer_entropy_100bins": 0.9861154699025338, "calibration/buffer_entropy_10bins": 0.9735163551188186, "calibration/buffer_entropy_50bins": 0.9837800172609528, "calibration/confidence_entropy": 0.444702059210076, "calibration/coverage@0%": 0.1835072163523095, "calibration/coverage@1%": 0.25085980835864113, "calibration/coverage@10%": 0.6750243154773228, "calibration/coverage@15%": 0.754269822612892, "calibration/coverage@20%": 0.8678158253356143, "calibration/coverage@25%": 0.9021978021978022, "calibration/coverage@30%": 0.9181318681318682, "calibration/coverage@5%": 0.5333837038338781, "calibration/distribution_entropy_10": 0.8756279425815006, "calibration/distribution_entropy_100": 0.9106554886391262, "calibration/ece": 0.11786937630319141, "calibration/mean_confidence": 0.6930237447381653, "calibration/unique_confidence_per_question": 0.9671875, "calibration/unique_confidences": 371.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036805555555555536, "completions/max_length": 2880.4, "completions/max_terminated_length": 2880.4, "completions/mean_length": 646.36328125, "completions/mean_terminated_length": 671.0564208984375, "completions/min_length": 0.0, "completions/min_terminated_length": 190.6, "epoch": 0.38399520005999926, "grad_norm": 0.0026809952687472105, "learning_rate": 1.4457831325301204e-06, "loss": -0.1022, "num_tokens": 328913396.0, "reward": 0.9842295050621033, "reward_std": 0.16013213694095613, "rewards/accuracy_reward": 0.6764756917953492, "rewards/batch_coverage_0": 0.3238168299198151, "rewards/brier_reward": 0.7961023449897766, "rewards/confidence_uniqueness_reward": 0.9045828580856323, "rewards/format_reward": 0.9625000119209289, "rewards/frontier_entropy_batch_reward": -0.37708542943000795, "signal/accuracy_reward/centered_abs_mean": 0.14662000834941863, "signal/accuracy_reward/group_std_mean": 0.19255328476428984, "signal/accuracy_reward/group_zero_std_frac": 0.4527777791023254, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0311393976211547, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07331000417470931, "signal/advantage_abs_mean": 0.7269745111465454, "signal/advantage_pre_scale_abs_mean": 0.11633765548467637, "signal/advantage_pre_scale_std": 0.19983015954494476, "signal/advantage_std": 0.9830737590789795, "signal/batch_coverage_0/centered_abs_mean": 0.15227404236793518, "signal/batch_coverage_0/group_std_mean": 0.1953058809041977, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21439152359962463, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015227404795587063, "signal/brier_reward/centered_abs_mean": 0.1461849495768547, "signal/brier_reward/group_std_mean": 0.1892971932888031, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.20605530440807343, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.014618495106697082, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.07116645872592926, "signal/confidence_uniqueness_reward/group_std_mean": 0.10824198424816131, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.1000448852777481, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.007116645760834217, "signal/format_reward/centered_abs_mean": 0.05997178852558136, "signal/format_reward/group_std_mean": 0.09567098915576935, "signal/format_reward/group_zero_std_frac": 0.6666666626930237, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.4201627790927887, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.02998589426279068, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33648825287818906, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40563682913780214, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47566083669662473, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03364882543683052, "step": 160 }, { "calibration/aurc": 0.1314698647681308, "calibration/batch_distribution_entropy": 0.9539865491003766, "calibration/batch_entropy_100bins": 0.9465729070156297, "calibration/batch_entropy_10bins": 0.9539865491003766, "calibration/batch_entropy_50bins": 0.9570973253069107, "calibration/batch_uniqueness": 0.9471188639087285, "calibration/buffer_distribution_entropy": 0.9719415474139419, "calibration/buffer_entropy_100bins": 0.9853135667011523, "calibration/buffer_entropy_10bins": 0.9719415474139419, "calibration/buffer_entropy_50bins": 0.9828414865175944, "calibration/confidence_entropy": 0.4629173760167061, "calibration/coverage@0%": 0.055678425540073115, "calibration/coverage@1%": 0.055678425540073115, "calibration/coverage@10%": 0.5788827150747762, "calibration/coverage@15%": 0.6855880312968711, "calibration/coverage@20%": 0.7366116007187298, "calibration/coverage@25%": 0.7927668933284889, "calibration/coverage@30%": 0.909690593071274, "calibration/coverage@5%": 0.35066449037826136, "calibration/distribution_entropy_10": 0.9539865491003766, "calibration/distribution_entropy_100": 0.9465729070156297, "calibration/ece": 0.20537345733881648, "calibration/mean_confidence": 0.5303268611443963, "calibration/unique_confidence_per_question": 0.9776041666666666, "calibration/unique_confidences": 375.4, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022135416666666675, "completions/max_length": 3410.4, "completions/max_terminated_length": 3410.4, "completions/mean_length": 720.5960205078125, "completions/mean_terminated_length": 736.8052001953125, "completions/min_length": 0.0, "completions/min_terminated_length": 229.2, "epoch": 0.39599505006187424, "grad_norm": 0.0021246892865747213, "learning_rate": 1.2951807228915664e-06, "loss": -0.0594, "num_tokens": 340353734.0, "reward": 1.0025168299674987, "reward_std": 0.13918909132480622, "rewards/accuracy_reward": 0.6728298664093018, "rewards/batch_coverage_0": 0.3095873832702637, "rewards/brier_reward": 0.7944852590560914, "rewards/confidence_uniqueness_reward": 0.929128909111023, "rewards/format_reward": 0.9777777791023254, "rewards/frontier_entropy_batch_reward": -0.2610716551542282, "signal/accuracy_reward/centered_abs_mean": 0.1396755650639534, "signal/accuracy_reward/group_std_mean": 0.18935762345790863, "signal/accuracy_reward/group_zero_std_frac": 0.43888888955116273, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8814050197601319, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0698377825319767, "signal/advantage_abs_mean": 0.7221243023872376, "signal/advantage_pre_scale_abs_mean": 0.09935600310564041, "signal/advantage_pre_scale_std": 0.16630815267562865, "signal/advantage_std": 0.9832156300544739, "signal/batch_coverage_0/centered_abs_mean": 0.1702214241027832, "signal/batch_coverage_0/group_std_mean": 0.21486833691596985, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2145317405462265, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.017022141814231874, "signal/brier_reward/centered_abs_mean": 0.13762031197547914, "signal/brier_reward/group_std_mean": 0.18010320365428925, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17319863736629487, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.013762031495571137, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.045266738161444664, "signal/confidence_uniqueness_reward/group_std_mean": 0.07511035352945328, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.056387230008840564, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004526673769578338, "signal/format_reward/centered_abs_mean": 0.03636067621409893, "signal/format_reward/group_std_mean": 0.06495743244886398, "signal/format_reward/group_zero_std_frac": 0.7500000119209289, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.22450997531414033, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.018180338107049464, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3157182991504669, "signal/frontier_entropy_batch_reward/group_std_mean": 0.38673295378684996, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4012265205383301, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03157182894647122, "step": 165 }, { "calibration/aurc": 0.11538836511334989, "calibration/batch_distribution_entropy": 0.925228953185431, "calibration/batch_entropy_100bins": 0.9315305819400574, "calibration/batch_entropy_10bins": 0.925228953185431, "calibration/batch_entropy_50bins": 0.9407669643923352, "calibration/batch_uniqueness": 0.9407560578101188, "calibration/buffer_distribution_entropy": 0.9727285409627902, "calibration/buffer_entropy_100bins": 0.9857416673026582, "calibration/buffer_entropy_10bins": 0.9727285409627902, "calibration/buffer_entropy_50bins": 0.983333567489086, "calibration/confidence_entropy": 0.4530259297587672, "calibration/coverage@0%": 0.090934427705629, "calibration/coverage@1%": 0.13260109437229567, "calibration/coverage@10%": 0.5461385783004304, "calibration/coverage@15%": 0.7319897814544014, "calibration/coverage@20%": 0.8508060829526263, "calibration/coverage@25%": 0.9216212225055326, "calibration/coverage@30%": 0.9787373503445759, "calibration/coverage@5%": 0.2340060737792186, "calibration/distribution_entropy_10": 0.925228953185431, "calibration/distribution_entropy_100": 0.9315305819400574, "calibration/ece": 0.10519794172538484, "calibration/mean_confidence": 0.6474902234638621, "calibration/unique_confidence_per_question": 0.9885416666666667, "calibration/unique_confidences": 379.6, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008680555555555535, "completions/max_length": 3761.0, "completions/max_terminated_length": 3761.0, "completions/mean_length": 727.771533203125, "completions/mean_terminated_length": 734.15712890625, "completions/min_length": 0.0, "completions/min_terminated_length": 224.6, "epoch": 0.4079949000637492, "grad_norm": 0.00241382815875113, "learning_rate": 1.1445783132530121e-06, "loss": -0.0275, "num_tokens": 351826846.0, "reward": 1.032625389099121, "reward_std": 0.12265074402093887, "rewards/accuracy_reward": 0.7263020753860474, "rewards/batch_coverage_0": 0.33543471097946165, "rewards/brier_reward": 0.8319101929664612, "rewards/confidence_uniqueness_reward": 0.9344179272651673, "rewards/format_reward": 0.9912326335906982, "rewards/frontier_entropy_batch_reward": -0.3631828844547272, "signal/accuracy_reward/centered_abs_mean": 0.13776584416627885, "signal/accuracy_reward/group_std_mean": 0.1854351818561554, "signal/accuracy_reward/group_zero_std_frac": 0.4638888955116272, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0262118101119995, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06888292208313943, "signal/advantage_abs_mean": 0.735342538356781, "signal/advantage_pre_scale_abs_mean": 0.08866324722766876, "signal/advantage_pre_scale_std": 0.14827727675437927, "signal/advantage_std": 0.9829926133155823, "signal/batch_coverage_0/centered_abs_mean": 0.15526262521743775, "signal/batch_coverage_0/group_std_mean": 0.19668332040309905, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23164966702461243, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015526262111961842, "signal/brier_reward/centered_abs_mean": 0.12140081971883773, "signal/brier_reward/group_std_mean": 0.16084674894809722, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18126706182956695, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.012140082381665706, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.03103194199502468, "signal/confidence_uniqueness_reward/group_std_mean": 0.05045064315199852, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04684874601662159, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003103194246068597, "signal/format_reward/centered_abs_mean": 0.01565212644636631, "signal/format_reward/group_std_mean": 0.03198937810957432, "signal/format_reward/group_zero_std_frac": 0.8583333253860473, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.11856901496648789, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007826063223183155, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3486171066761017, "signal/frontier_entropy_batch_reward/group_std_mean": 0.41613501906394956, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5219741106033325, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.034861710667610166, "step": 170 }, { "calibration/aurc": 0.08714649584563566, "calibration/batch_distribution_entropy": 0.9582676263767113, "calibration/batch_entropy_100bins": 0.9497775003333068, "calibration/batch_entropy_10bins": 0.9582676263767113, "calibration/batch_entropy_50bins": 0.9611324354823612, "calibration/batch_uniqueness": 0.9488688537919441, "calibration/buffer_distribution_entropy": 0.9723564713598757, "calibration/buffer_entropy_100bins": 0.9855557420167175, "calibration/buffer_entropy_10bins": 0.9723564713598757, "calibration/buffer_entropy_50bins": 0.9831130728598103, "calibration/confidence_entropy": 0.46930408368645865, "calibration/coverage@0%": 0.1124440803878803, "calibration/coverage@1%": 0.1946861438799438, "calibration/coverage@10%": 0.6197900524033957, "calibration/coverage@15%": 0.7821157887456747, "calibration/coverage@20%": 0.9116034973982613, "calibration/coverage@25%": 0.9745239509945393, "calibration/coverage@30%": 1.0, "calibration/coverage@5%": 0.40132178443876604, "calibration/distribution_entropy_10": 0.9582676263767113, "calibration/distribution_entropy_100": 0.9497775003333068, "calibration/ece": 0.20635704384932518, "calibration/mean_confidence": 0.5645161705073962, "calibration/unique_confidence_per_question": 0.9864583333333334, "calibration/unique_confidences": 378.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013454861111111115, "completions/max_length": 3692.8, "completions/max_terminated_length": 3692.8, "completions/mean_length": 756.1865478515625, "completions/mean_terminated_length": 766.48515625, "completions/min_length": 0.0, "completions/min_terminated_length": 218.4, "epoch": 0.4199947500656242, "grad_norm": 0.0023003392852842808, "learning_rate": 9.93975903614458e-07, "loss": -0.0367, "num_tokens": 363646083.0, "reward": 1.0302478551864624, "reward_std": 0.12525633573532105, "rewards/accuracy_reward": 0.7182291746139526, "rewards/batch_coverage_0": 0.32915496826171875, "rewards/brier_reward": 0.8144567489624024, "rewards/confidence_uniqueness_reward": 0.934039568901062, "rewards/format_reward": 0.9863715171813965, "rewards/frontier_entropy_batch_reward": -0.2981763184070587, "signal/accuracy_reward/centered_abs_mean": 0.14651692509651185, "signal/accuracy_reward/group_std_mean": 0.19464356005191802, "signal/accuracy_reward/group_zero_std_frac": 0.4416666626930237, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.059949767589569, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07325846254825592, "signal/advantage_abs_mean": 0.7322659015655517, "signal/advantage_pre_scale_abs_mean": 0.09160526245832443, "signal/advantage_pre_scale_std": 0.1541106790304184, "signal/advantage_std": 0.9830336213111878, "signal/batch_coverage_0/centered_abs_mean": 0.17044160962104798, "signal/batch_coverage_0/group_std_mean": 0.21640016436576842, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.2487411081790924, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.017044160701334476, "signal/brier_reward/centered_abs_mean": 0.13068657219409943, "signal/brier_reward/group_std_mean": 0.16981444656848907, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18954426646232606, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.013068657368421555, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.03548645004630089, "signal/confidence_uniqueness_reward/group_std_mean": 0.055370701104402544, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.051629979908466336, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0035486449021846058, "signal/format_reward/centered_abs_mean": 0.02208658866584301, "signal/format_reward/group_std_mean": 0.03925186991691589, "signal/format_reward/group_zero_std_frac": 0.8444444417953492, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.16051022261381148, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.011043294332921505, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32774901390075684, "signal/frontier_entropy_batch_reward/group_std_mean": 0.3981485068798065, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.47704883217811583, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03277490176260471, "step": 175 }, { "calibration/aurc": 0.08240090854620943, "calibration/batch_distribution_entropy": 0.9407075408953339, "calibration/batch_entropy_100bins": 0.9397327456537774, "calibration/batch_entropy_10bins": 0.9407075408953339, "calibration/batch_entropy_50bins": 0.9475896407362339, "calibration/batch_uniqueness": 0.9445637852645128, "calibration/buffer_distribution_entropy": 0.9721319964350418, "calibration/buffer_entropy_100bins": 0.985457787774781, "calibration/buffer_entropy_10bins": 0.9721319964350418, "calibration/buffer_entropy_50bins": 0.9829938347745161, "calibration/confidence_entropy": 0.46792723665558106, "calibration/coverage@0%": 0.08522190425453813, "calibration/coverage@1%": 0.12364295688611708, "calibration/coverage@10%": 0.7188442015658267, "calibration/coverage@15%": 0.8503315215324696, "calibration/coverage@20%": 0.9222259642094972, "calibration/coverage@25%": 0.9571244560487381, "calibration/coverage@30%": 0.981201044386423, "calibration/coverage@5%": 0.4305176868635339, "calibration/distribution_entropy_10": 0.9407075408953339, "calibration/distribution_entropy_100": 0.9397327456537774, "calibration/ece": 0.156930699984235, "calibration/mean_confidence": 0.6234170294374884, "calibration/unique_confidence_per_question": 0.9822916666666668, "calibration/unique_confidences": 377.2, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012413194444444465, "completions/max_length": 3749.6, "completions/max_terminated_length": 3749.6, "completions/mean_length": 743.4819458007812, "completions/mean_terminated_length": 752.8460205078125, "completions/min_length": 0.0, "completions/min_terminated_length": 243.2, "epoch": 0.4319946000674992, "grad_norm": 0.00234069861471653, "learning_rate": 8.433734939759036e-07, "loss": -0.0346, "num_tokens": 375310963.0, "reward": 1.0280283689498901, "reward_std": 0.12910850942134858, "rewards/accuracy_reward": 0.7190972208976746, "rewards/batch_coverage_0": 0.31788222789764403, "rewards/brier_reward": 0.8127464532852173, "rewards/confidence_uniqueness_reward": 0.9341015100479126, "rewards/format_reward": 0.9874131917953491, "rewards/frontier_entropy_batch_reward": -0.3169987857341766, "signal/accuracy_reward/centered_abs_mean": 0.14658203125, "signal/accuracy_reward/group_std_mean": 0.19198833405971527, "signal/accuracy_reward/group_zero_std_frac": 0.4583333432674408, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0368584036827087, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.073291015625, "signal/advantage_abs_mean": 0.7443090558052063, "signal/advantage_pre_scale_abs_mean": 0.09514848291873931, "signal/advantage_pre_scale_std": 0.15735865235328675, "signal/advantage_std": 0.9830684781074523, "signal/batch_coverage_0/centered_abs_mean": 0.15284710228443146, "signal/batch_coverage_0/group_std_mean": 0.19452511370182038, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21728379726409913, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015284710563719273, "signal/brier_reward/centered_abs_mean": 0.12713438272476196, "signal/brier_reward/group_std_mean": 0.16676026284694673, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18021451830863952, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.012713438645005227, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.03413993827998638, "signal/confidence_uniqueness_reward/group_std_mean": 0.056762049347162245, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04856302812695503, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.003413993865251541, "signal/format_reward/centered_abs_mean": 0.02097981758415699, "signal/format_reward/group_std_mean": 0.04119304716587067, "signal/format_reward/group_zero_std_frac": 0.8222222328186035, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.14933316856622697, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010489908792078496, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3364332139492035, "signal/frontier_entropy_batch_reward/group_std_mean": 0.4029194176197052, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4776536524295807, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.033643323183059695, "step": 180 }, { "calibration/aurc": 0.17622694813717843, "calibration/batch_distribution_entropy": 0.9483525734268292, "calibration/batch_entropy_100bins": 0.9437878593028112, "calibration/batch_entropy_10bins": 0.9483525734268292, "calibration/batch_entropy_50bins": 0.9537535291394281, "calibration/batch_uniqueness": 0.9465086091799648, "calibration/buffer_distribution_entropy": 0.9712713337492183, "calibration/buffer_entropy_100bins": 0.9850464005867252, "calibration/buffer_entropy_10bins": 0.9712713337492183, "calibration/buffer_entropy_50bins": 0.9825057272465818, "calibration/confidence_entropy": 0.48850146133497957, "calibration/coverage@0%": 0.041510637632665943, "calibration/coverage@1%": 0.041510637632665943, "calibration/coverage@10%": 0.22907567729884618, "calibration/coverage@15%": 0.42216374269005846, "calibration/coverage@20%": 0.6706528723898518, "calibration/coverage@25%": 0.8674514833646343, "calibration/coverage@30%": 0.9366991892263297, "calibration/coverage@5%": 0.06203695342213962, "calibration/distribution_entropy_10": 0.9483525734268292, "calibration/distribution_entropy_100": 0.9437878593028112, "calibration/ece": 0.18354075972457048, "calibration/mean_confidence": 0.5970391973608894, "calibration/unique_confidence_per_question": 0.9916666666666668, "calibration/unique_confidences": 380.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010677083333333327, "completions/max_length": 3388.4, "completions/max_terminated_length": 3388.4, "completions/mean_length": 738.7443725585938, "completions/mean_terminated_length": 746.72666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 218.8, "epoch": 0.44399445006937416, "grad_norm": 0.0022149120923131704, "learning_rate": 6.927710843373495e-07, "loss": -0.0301, "num_tokens": 386911346.0, "reward": 1.0222719669342042, "reward_std": 0.12421630620956421, "rewards/accuracy_reward": 0.6988715291023254, "rewards/batch_coverage_0": 0.313459575176239, "rewards/brier_reward": 0.8119510054588318, "rewards/confidence_uniqueness_reward": 0.9384174823760987, "rewards/format_reward": 0.9892361164093018, "rewards/frontier_entropy_batch_reward": -0.28164616525173186, "signal/accuracy_reward/centered_abs_mean": 0.1408148854970932, "signal/accuracy_reward/group_std_mean": 0.18646234571933745, "signal/accuracy_reward/group_zero_std_frac": 0.46666666865348816, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9637740254402161, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.0704074427485466, "signal/advantage_abs_mean": 0.7450667142868042, "signal/advantage_pre_scale_abs_mean": 0.09193661212921142, "signal/advantage_pre_scale_std": 0.14988455176353455, "signal/advantage_std": 0.9831077575683593, "signal/batch_coverage_0/centered_abs_mean": 0.15791791081428527, "signal/batch_coverage_0/group_std_mean": 0.1993875563144684, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21790952682495118, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.015791791677474975, "signal/brier_reward/centered_abs_mean": 0.1271849974989891, "signal/brier_reward/group_std_mean": 0.16353001594543456, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17506942749023438, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.012718500196933746, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.029871857166290282, "signal/confidence_uniqueness_reward/group_std_mean": 0.049267768114805224, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04109632298350334, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0029871856328099965, "signal/format_reward/centered_abs_mean": 0.01829427033662796, "signal/format_reward/group_std_mean": 0.035397492721676825, "signal/format_reward/group_zero_std_frac": 0.85, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.12542281299829483, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.00914713516831398, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3175092935562134, "signal/frontier_entropy_batch_reward/group_std_mean": 0.38766228556633, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.43858484625816346, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0317509301006794, "step": 185 }, { "calibration/aurc": 0.12269664298524821, "calibration/batch_distribution_entropy": 0.9097176166639136, "calibration/batch_entropy_100bins": 0.9256166673164021, "calibration/batch_entropy_10bins": 0.9097176166639136, "calibration/batch_entropy_50bins": 0.9305331134066039, "calibration/batch_uniqueness": 0.9398041886522405, "calibration/buffer_distribution_entropy": 0.97077519455814, "calibration/buffer_entropy_100bins": 0.9848005927504747, "calibration/buffer_entropy_10bins": 0.97077519455814, "calibration/buffer_entropy_50bins": 0.9822313921481187, "calibration/confidence_entropy": 0.4558133308321491, "calibration/coverage@0%": 0.03350573937121969, "calibration/coverage@1%": 0.08599917769142966, "calibration/coverage@10%": 0.499440615097099, "calibration/coverage@15%": 0.5598557534158825, "calibration/coverage@20%": 0.882024124561112, "calibration/coverage@25%": 0.9567708333333332, "calibration/coverage@30%": 0.9791666666666667, "calibration/coverage@5%": 0.420323940035368, "calibration/distribution_entropy_10": 0.9097176166639136, "calibration/distribution_entropy_100": 0.9256166673164021, "calibration/ece": 0.14582941996305046, "calibration/mean_confidence": 0.6561318203686277, "calibration/unique_confidence_per_question": 0.9953125, "calibration/unique_confidences": 382.2, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006510416666666674, "completions/max_length": 3278.8, "completions/max_terminated_length": 3278.8, "completions/mean_length": 734.9641479492187, "completions/mean_terminated_length": 739.8077758789062, "completions/min_length": 0.0, "completions/min_terminated_length": 231.0, "epoch": 0.45599430007124914, "grad_norm": 0.002926592016592622, "learning_rate": 5.421686746987952e-07, "loss": -0.0133, "num_tokens": 398461077.0, "reward": 1.0405804634094238, "reward_std": 0.11852861940860748, "rewards/accuracy_reward": 0.7360243082046509, "rewards/batch_coverage_0": 0.3142602384090424, "rewards/brier_reward": 0.8369073390960693, "rewards/confidence_uniqueness_reward": 0.9390465021133423, "rewards/format_reward": 0.9933159708976745, "rewards/frontier_entropy_batch_reward": -0.3311110734939575, "signal/accuracy_reward/centered_abs_mean": 0.13892686367034912, "signal/accuracy_reward/group_std_mean": 0.18981125354766845, "signal/accuracy_reward/group_zero_std_frac": 0.4222222208976746, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0067436337471007, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06946343183517456, "signal/advantage_abs_mean": 0.727643609046936, "signal/advantage_pre_scale_abs_mean": 0.08646512627601624, "signal/advantage_pre_scale_std": 0.14237033128738402, "signal/advantage_std": 0.9830322980880737, "signal/batch_coverage_0/centered_abs_mean": 0.13719195425510405, "signal/batch_coverage_0/group_std_mean": 0.17514330744743348, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.19944521486759187, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.013719195872545243, "signal/brier_reward/centered_abs_mean": 0.11094630658626556, "signal/brier_reward/group_std_mean": 0.1479420006275177, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.16077562868595124, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.011094630509614945, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.026065148785710333, "signal/confidence_uniqueness_reward/group_std_mean": 0.04158492237329483, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.03798387497663498, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002606514934450388, "signal/format_reward/centered_abs_mean": 0.011876085167750716, "signal/format_reward/group_std_mean": 0.024491559341549875, "signal/format_reward/group_zero_std_frac": 0.8916666746139527, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.08604965135455131, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.005938042583875358, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3279430866241455, "signal/frontier_entropy_batch_reward/group_std_mean": 0.3947389841079712, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4778772532939911, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03279430828988552, "step": 190 }, { "calibration/aurc": 0.13273813775825055, "calibration/batch_distribution_entropy": 0.9500027347292127, "calibration/batch_entropy_100bins": 0.9452918277091291, "calibration/batch_entropy_10bins": 0.9500027347292127, "calibration/batch_entropy_50bins": 0.9573763993422191, "calibration/batch_uniqueness": 0.9468649596642662, "calibration/buffer_distribution_entropy": 0.9689344574900707, "calibration/buffer_entropy_100bins": 0.9838738701234462, "calibration/buffer_entropy_10bins": 0.9689344574900707, "calibration/buffer_entropy_50bins": 0.9811441658406614, "calibration/confidence_entropy": 0.47158294896673614, "calibration/coverage@0%": 0.04583078782115089, "calibration/coverage@1%": 0.04583078782115089, "calibration/coverage@10%": 0.39430990423665824, "calibration/coverage@15%": 0.638239374114255, "calibration/coverage@20%": 0.8346849527195428, "calibration/coverage@25%": 0.9250275069637883, "calibration/coverage@30%": 0.9679628597957288, "calibration/coverage@5%": 0.21747004220711577, "calibration/distribution_entropy_10": 0.9500027347292127, "calibration/distribution_entropy_100": 0.9452918277091291, "calibration/ece": 0.1880414980348, "calibration/mean_confidence": 0.5949393429926623, "calibration/unique_confidence_per_question": 0.9682291666666666, "calibration/unique_confidences": 371.8, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01440972222222221, "completions/max_length": 3499.8, "completions/max_terminated_length": 3499.8, "completions/mean_length": 747.442822265625, "completions/mean_terminated_length": 758.5841552734375, "completions/min_length": 0.0, "completions/min_terminated_length": 200.0, "epoch": 0.46799415007312406, "grad_norm": 0.0021557214204221964, "learning_rate": 3.91566265060241e-07, "loss": -0.0423, "num_tokens": 410152482.0, "reward": 1.0175115585327148, "reward_std": 0.1258660525083542, "rewards/accuracy_reward": 0.6973090291023254, "rewards/batch_coverage_0": 0.33180397748947144, "rewards/brier_reward": 0.8193559408187866, "rewards/confidence_uniqueness_reward": 0.9310436129570008, "rewards/format_reward": 0.9855902791023254, "rewards/frontier_entropy_batch_reward": -0.3215842306613922, "signal/accuracy_reward/centered_abs_mean": 0.13668077439069748, "signal/accuracy_reward/group_std_mean": 0.18262230157852172, "signal/accuracy_reward/group_zero_std_frac": 0.4750000059604645, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.977854585647583, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06834038719534874, "signal/advantage_abs_mean": 0.7343301057815552, "signal/advantage_pre_scale_abs_mean": 0.09162229895591736, "signal/advantage_pre_scale_std": 0.15324749201536178, "signal/advantage_std": 0.9830029845237732, "signal/batch_coverage_0/centered_abs_mean": 0.14713969677686692, "signal/batch_coverage_0/group_std_mean": 0.18895590007305146, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21866745948791505, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.014713969826698304, "signal/brier_reward/centered_abs_mean": 0.12192474007606506, "signal/brier_reward/group_std_mean": 0.1624041885137558, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.17810553312301636, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.012192474491894246, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.03553067147731781, "signal/confidence_uniqueness_reward/group_std_mean": 0.057105415314435956, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.052296724170446396, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0035530673805624245, "signal/format_reward/centered_abs_mean": 0.02194010429084301, "signal/format_reward/group_std_mean": 0.04070322811603546, "signal/format_reward/group_zero_std_frac": 0.8361111044883728, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.15765180736780166, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.010970052145421506, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.32328269481658933, "signal/frontier_entropy_batch_reward/group_std_mean": 0.39172765612602234, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4804633140563965, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03232826963067055, "step": 195 }, { "calibration/aurc": 0.13867021776965863, "calibration/batch_distribution_entropy": 0.9353270684263691, "calibration/batch_entropy_100bins": 0.9368945411224996, "calibration/batch_entropy_10bins": 0.9353270684263691, "calibration/batch_entropy_50bins": 0.9452629006479147, "calibration/batch_uniqueness": 0.9436675647789519, "calibration/buffer_distribution_entropy": 0.969523410710188, "calibration/buffer_entropy_100bins": 0.9841768835974831, "calibration/buffer_entropy_10bins": 0.969523410710188, "calibration/buffer_entropy_50bins": 0.9815198335463741, "calibration/confidence_entropy": 0.48172871548592217, "calibration/coverage@0%": 0.04717562944155479, "calibration/coverage@1%": 0.04717562944155479, "calibration/coverage@10%": 0.4994783742967181, "calibration/coverage@15%": 0.6450320866998349, "calibration/coverage@20%": 0.7019051892950392, "calibration/coverage@25%": 0.9165333986074848, "calibration/coverage@30%": 0.9618798955613578, "calibration/coverage@5%": 0.20501936380830985, "calibration/distribution_entropy_10": 0.9353270684263691, "calibration/distribution_entropy_100": 0.9368945411224996, "calibration/ece": 0.1467593867620299, "calibration/mean_confidence": 0.6385691392790117, "calibration/unique_confidence_per_question": 0.9979166666666668, "calibration/unique_confidences": 383.2, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007725694444444442, "completions/max_length": 3300.4, "completions/max_terminated_length": 3300.4, "completions/mean_length": 737.7262329101562, "completions/mean_terminated_length": 743.5352416992188, "completions/min_length": 0.0, "completions/min_terminated_length": 241.2, "epoch": 0.47999400007499904, "grad_norm": 0.004845115356147289, "learning_rate": 2.409638554216868e-07, "loss": -0.0184, "num_tokens": 421718896.0, "reward": 1.03243727684021, "reward_std": 0.11533114612102509, "rewards/accuracy_reward": 0.7151041626930237, "rewards/batch_coverage_0": 0.35005253553390503, "rewards/brier_reward": 0.8234564542770386, "rewards/confidence_uniqueness_reward": 0.9383493304252625, "rewards/format_reward": 0.9921874880790711, "rewards/frontier_entropy_batch_reward": -0.323943692445755, "signal/accuracy_reward/centered_abs_mean": 0.13002387136220933, "signal/accuracy_reward/group_std_mean": 0.17043959200382233, "signal/accuracy_reward/group_zero_std_frac": 0.5166666746139527, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9897404432296752, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06501193568110467, "signal/advantage_abs_mean": 0.7570445895195007, "signal/advantage_pre_scale_abs_mean": 0.08690356314182282, "signal/advantage_pre_scale_std": 0.14214968383312226, "signal/advantage_std": 0.9829631567001342, "signal/batch_coverage_0/centered_abs_mean": 0.146352881193161, "signal/batch_coverage_0/group_std_mean": 0.18568513095378875, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.22308144569396973, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.014635288156569004, "signal/brier_reward/centered_abs_mean": 0.11911999583244323, "signal/brier_reward/group_std_mean": 0.15573802292346955, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18171513974666595, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.011911999993026256, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.02734166830778122, "signal/confidence_uniqueness_reward/group_std_mean": 0.042832625657320024, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.04162702783942222, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0027341668028384447, "signal/format_reward/centered_abs_mean": 0.012923177052289247, "signal/format_reward/group_std_mean": 0.02554241828620434, "signal/format_reward/group_zero_std_frac": 0.8888889074325561, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.0979671061038971, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006461588526144623, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3360644280910492, "signal/frontier_entropy_batch_reward/group_std_mean": 0.40355847477912904, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.5128159046173095, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03360644280910492, "step": 200 }, { "epoch": 0.47999400007499904, "eval_completions/clipped_ratio": 0.006076388888888895, "eval_completions/max_length": 2380.5, "eval_completions/max_terminated_length": 2380.5, "eval_completions/mean_length": 742.8623555501302, "eval_completions/mean_terminated_length": 747.4195251464844, "eval_completions/min_length": 83.0, "eval_completions/min_terminated_length": 232.83333333333334, "eval_loss": 0.0, "eval_num_tokens": 421718896.0, "eval_reward": 0.9218910336494446, "eval_reward_std": 0.22422286123037338, "eval_rewards/accuracy_reward": 0.6953125, "eval_rewards/batch_coverage_0": 0.05044874486823877, "eval_rewards/brier_reward": 0.8252258896827698, "eval_rewards/confidence_uniqueness_reward": 0.8909785449504852, "eval_rewards/format_reward": 0.9939236144224802, "eval_rewards/frontier_entropy_batch_reward": -0.9939236144224802, "eval_runtime": 178.0731, "eval_samples_per_second": 5.616, "eval_signal/accuracy_reward/centered_abs_mean": 0.4032660573720932, "eval_signal/accuracy_reward/group_std_mean": 0.4539312819639842, "eval_signal/accuracy_reward/group_zero_std_frac": 0.0, "eval_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9079023202260336, "eval_signal/accuracy_reward/weight": 0.5, "eval_signal/accuracy_reward/weighted_centered_abs_mean": 0.2016330286860466, "eval_signal/advantage_abs_mean": 0.8540914952754974, "eval_signal/advantage_pre_scale_abs_mean": 0.19251804302136102, "eval_signal/advantage_pre_scale_std": 0.22243775675694147, "eval_signal/advantage_std": 0.9863770306110382, "eval_signal/batch_coverage_0/centered_abs_mean": 0.17725825061400732, "eval_signal/batch_coverage_0/group_std_mean": 0.25420698275168735, "eval_signal/batch_coverage_0/group_zero_std_frac": 0.0, "eval_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07988938316702843, "eval_signal/batch_coverage_0/weight": 0.10000000149011612, "eval_signal/batch_coverage_0/weighted_centered_abs_mean": 0.017725825930635136, "eval_signal/brier_reward/centered_abs_mean": 0.17397530128558478, "eval_signal/brier_reward/group_std_mean": 0.23295909663041434, "eval_signal/brier_reward/group_zero_std_frac": 0.0, "eval_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07850411782662074, "eval_signal/brier_reward/weight": 0.10000000149011612, "eval_signal/brier_reward/weighted_centered_abs_mean": 0.01739752929036816, "eval_signal/confidence_uniqueness_reward/centered_abs_mean": 0.046860777462522187, "eval_signal/confidence_uniqueness_reward/group_std_mean": 0.069014647975564, "eval_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "eval_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.02106233189503352, "eval_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "eval_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.004686077823862433, "eval_signal/format_reward/centered_abs_mean": 0.01177300326526165, "eval_signal/format_reward/group_std_mean": 0.034373246133327484, "eval_signal/format_reward/group_zero_std_frac": 0.8055555721124014, "eval_signal/format_reward/scaled_weighted_centered_abs_mean": 0.025194636546075344, "eval_signal/format_reward/weight": 0.5, "eval_signal/format_reward/weighted_centered_abs_mean": 0.005886501632630825, "eval_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.01177300326526165, "eval_signal/frontier_entropy_batch_reward/group_std_mean": 0.034373246133327484, "eval_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.8055555721124014, "eval_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.005038927309215069, "eval_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "eval_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0011773004274194439, "eval_steps_per_second": 0.034, "step": 200 }, { "epoch": 0.47999400007499904, "step": 200, "train_probe_completions/clipped_ratio": 0.011284722222222229, "train_probe_completions/max_length": 2415.6666666666665, "train_probe_completions/max_terminated_length": 2415.6666666666665, "train_probe_completions/mean_length": 747.6194661458334, "train_probe_completions/mean_terminated_length": 756.1844787597656, "train_probe_completions/min_length": 45.333333333333336, "train_probe_completions/min_terminated_length": 237.33333333333334, "train_probe_loss": 0.0, "train_probe_num_tokens": 421718896.0, "train_probe_reward": 0.9298777182896932, "train_probe_reward_std": 0.23085811734199524, "train_probe_rewards/accuracy_reward": 0.7187499900658926, "train_probe_rewards/batch_coverage_0": 0.045416015510757766, "train_probe_rewards/brier_reward": 0.8248612880706787, "train_probe_rewards/confidence_uniqueness_reward": 0.8833604454994202, "train_probe_rewards/format_reward": 0.9878472288449606, "train_probe_rewards/frontier_entropy_batch_reward": -0.9878472288449606, "train_probe_runtime": 205.2798, "train_probe_samples_per_second": 4.871, "train_probe_signal/accuracy_reward/centered_abs_mean": 0.3943142394224803, "train_probe_signal/accuracy_reward/group_std_mean": 0.4489223013321559, "train_probe_signal/accuracy_reward/group_zero_std_frac": 0.0, "train_probe_signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.8741428951422373, "train_probe_signal/accuracy_reward/weight": 0.5, "train_probe_signal/accuracy_reward/weighted_centered_abs_mean": 0.19715711971124014, "train_probe_signal/advantage_abs_mean": 0.8349892695744833, "train_probe_signal/advantage_pre_scale_abs_mean": 0.1936338966091474, "train_probe_signal/advantage_pre_scale_std": 0.23024354626735052, "train_probe_signal/advantage_std": 0.9863850375016531, "train_probe_signal/batch_coverage_0/centered_abs_mean": 0.16201606268684068, "train_probe_signal/batch_coverage_0/group_std_mean": 0.2380441203713417, "train_probe_signal/batch_coverage_0/group_zero_std_frac": 0.0, "train_probe_signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.07158729247748852, "train_probe_signal/batch_coverage_0/weight": 0.10000000149011612, "train_probe_signal/batch_coverage_0/weighted_centered_abs_mean": 0.016201607262094814, "train_probe_signal/brier_reward/centered_abs_mean": 0.17809604605038962, "train_probe_signal/brier_reward/group_std_mean": 0.24038559198379517, "train_probe_signal/brier_reward/group_zero_std_frac": 0.0, "train_probe_signal/brier_reward/scaled_weighted_centered_abs_mean": 0.07882465422153473, "train_probe_signal/brier_reward/weight": 0.10000000149011612, "train_probe_signal/brier_reward/weighted_centered_abs_mean": 0.017809605070700247, "train_probe_signal/confidence_uniqueness_reward/centered_abs_mean": 0.0523125467201074, "train_probe_signal/confidence_uniqueness_reward/group_std_mean": 0.09005353599786758, "train_probe_signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "train_probe_signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.023148629814386368, "train_probe_signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "train_probe_signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.005231254966929555, "train_probe_signal/format_reward/centered_abs_mean": 0.023437499689559143, "train_probe_signal/format_reward/group_std_mean": 0.0657570833961169, "train_probe_signal/format_reward/group_zero_std_frac": 0.6388889104127884, "train_probe_signal/format_reward/scaled_weighted_centered_abs_mean": 0.0509293849269549, "train_probe_signal/format_reward/weight": 0.5, "train_probe_signal/format_reward/weighted_centered_abs_mean": 0.011718749844779571, "train_probe_signal/frontier_entropy_batch_reward/centered_abs_mean": 0.023437499689559143, "train_probe_signal/frontier_entropy_batch_reward/group_std_mean": 0.0657570833961169, "train_probe_signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.6388889104127884, "train_probe_signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.01018587724926571, "train_probe_signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "train_probe_signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.0023437499767169356, "train_probe_steps_per_second": 0.029 }, { "calibration/aurc": 0.12239767954214946, "calibration/batch_distribution_entropy": 0.9301196629465762, "calibration/batch_entropy_100bins": 0.9379092579008269, "calibration/batch_entropy_10bins": 0.9301196629465762, "calibration/batch_entropy_50bins": 0.9458583871040073, "calibration/batch_uniqueness": 0.9434521817203038, "calibration/buffer_distribution_entropy": 0.9698106620607934, "calibration/buffer_entropy_100bins": 0.9843158921249298, "calibration/buffer_entropy_10bins": 0.9698106620607934, "calibration/buffer_entropy_50bins": 0.9816746594560855, "calibration/confidence_entropy": 0.4687717514121054, "calibration/coverage@0%": 0.039820526775052816, "calibration/coverage@1%": 0.10648719344171949, "calibration/coverage@10%": 0.4955941593184532, "calibration/coverage@15%": 0.8548545673739323, "calibration/coverage@20%": 0.9153638157894737, "calibration/coverage@25%": 0.9544982456140352, "calibration/coverage@30%": 0.9946666666666667, "calibration/coverage@5%": 0.24880902222834572, "calibration/distribution_entropy_10": 0.9301196629465762, "calibration/distribution_entropy_100": 0.9379092579008269, "calibration/ece": 0.18600186920056827, "calibration/mean_confidence": 0.6150777857385921, "calibration/unique_confidence_per_question": 0.9921875, "calibration/unique_confidences": 381.0, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009461805555555581, "completions/max_length": 3795.2, "completions/max_terminated_length": 3795.2, "completions/mean_length": 751.1654418945312, "completions/mean_terminated_length": 758.399560546875, "completions/min_length": 0.0, "completions/min_terminated_length": 218.4, "epoch": 0.491993850076874, "grad_norm": 0.0028297537937760353, "learning_rate": 9.036144578313253e-08, "loss": -0.0244, "num_tokens": 433438274.0, "reward": 1.0493927717208862, "reward_std": 0.11577230393886566, "rewards/accuracy_reward": 0.7571180462837219, "rewards/batch_coverage_0": 0.30967661142349245, "rewards/brier_reward": 0.8156781673431397, "rewards/confidence_uniqueness_reward": 0.9383305668830871, "rewards/format_reward": 0.9903645753860474, "rewards/frontier_entropy_batch_reward": -0.3071707904338837, "signal/accuracy_reward/centered_abs_mean": 0.12980685979127884, "signal/accuracy_reward/group_std_mean": 0.17701960504055023, "signal/accuracy_reward/group_zero_std_frac": 0.4694444477558136, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 0.9657306671142578, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.06490342989563942, "signal/advantage_abs_mean": 0.737499189376831, "signal/advantage_pre_scale_abs_mean": 0.08461317420005798, "signal/advantage_pre_scale_std": 0.14267933070659639, "signal/advantage_std": 0.982995581626892, "signal/batch_coverage_0/centered_abs_mean": 0.14645980596542357, "signal/batch_coverage_0/group_std_mean": 0.18418991863727568, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.21881715953350067, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.014645980298519134, "signal/brier_reward/centered_abs_mean": 0.1212809681892395, "signal/brier_reward/group_std_mean": 0.15627764761447907, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.18062220215797425, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.012128096260130405, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.028160134702920912, "signal/confidence_uniqueness_reward/group_std_mean": 0.044972692430019376, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.041933953016996386, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.002816013665869832, "signal/format_reward/centered_abs_mean": 0.015283203311264515, "signal/format_reward/group_std_mean": 0.029318978637456895, "signal/format_reward/group_zero_std_frac": 0.8777777791023255, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.1133392333984375, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.007641601655632257, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.33200740814208984, "signal/frontier_entropy_batch_reward/group_std_mean": 0.39931764602661135, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.4956660270690918, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.03320074304938316, "step": 205 }, { "calibration/aurc": 0.09601065062995157, "calibration/batch_distribution_entropy": 0.9517801328093567, "calibration/batch_entropy_100bins": 0.944623264633357, "calibration/batch_entropy_10bins": 0.9517801328093567, "calibration/batch_entropy_50bins": 0.9570065362621402, "calibration/batch_uniqueness": 0.9459938709762948, "calibration/buffer_distribution_entropy": 0.9700833201805792, "calibration/buffer_entropy_100bins": 0.9844871800056595, "calibration/buffer_entropy_10bins": 0.9700833201805792, "calibration/buffer_entropy_50bins": 0.981872492355922, "calibration/confidence_entropy": 0.4824661780354327, "calibration/coverage@0%": 0.09961418008127575, "calibration/coverage@1%": 0.1136124302999984, "calibration/coverage@10%": 0.6836619312925309, "calibration/coverage@15%": 0.788315298707766, "calibration/coverage@20%": 0.8868581375108789, "calibration/coverage@25%": 0.9373368146214099, "calibration/coverage@30%": 0.9904264577893821, "calibration/coverage@5%": 0.29698768071745607, "calibration/distribution_entropy_10": 0.9517801328093567, "calibration/distribution_entropy_100": 0.944623264633357, "calibration/ece": 0.12387473016449674, "calibration/mean_confidence": 0.6253967837519441, "calibration/unique_confidence_per_question": 0.9956597222222223, "calibration/unique_confidences": 382.3333333333333, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500000000036, "completions/max_length": 3279.3333333333335, "completions/max_terminated_length": 3279.3333333333335, "completions/mean_length": 759.5371907552084, "completions/mean_terminated_length": 765.6172485351562, "completions/min_length": 0.0, "completions/min_terminated_length": 217.33333333333334, "epoch": 0.49919376007799904, "num_tokens": 440554243.0, "reward": 1.028142770131429, "reward_std": 0.1230898251136144, "rewards/accuracy_reward": 0.7016782363255819, "rewards/batch_coverage_0": 0.3386853138605754, "rewards/brier_reward": 0.8224726319313049, "rewards/confidence_uniqueness_reward": 0.9401467442512512, "rewards/format_reward": 0.9921875, "rewards/frontier_entropy_batch_reward": -0.2892061372598012, "signal/accuracy_reward/centered_abs_mean": 0.15324797481298447, "signal/accuracy_reward/group_std_mean": 0.19617354373137155, "signal/accuracy_reward/group_zero_std_frac": 0.46759259700775146, "signal/accuracy_reward/scaled_weighted_centered_abs_mean": 1.0735934575398762, "signal/accuracy_reward/weight": 0.5, "signal/accuracy_reward/weighted_centered_abs_mean": 0.07662398740649223, "signal/advantage_abs_mean": 0.7517887155214945, "signal/advantage_pre_scale_abs_mean": 0.09261459857225418, "signal/advantage_pre_scale_std": 0.14849475771188736, "signal/advantage_std": 0.9830325643221537, "signal/batch_coverage_0/centered_abs_mean": 0.15849936505158743, "signal/batch_coverage_0/group_std_mean": 0.20025220016638437, "signal/batch_coverage_0/group_zero_std_frac": 0.0, "signal/batch_coverage_0/scaled_weighted_centered_abs_mean": 0.23052164912223816, "signal/batch_coverage_0/weight": 0.10000000149011612, "signal/batch_coverage_0/weighted_centered_abs_mean": 0.01584993799527486, "signal/brier_reward/centered_abs_mean": 0.12450356781482697, "signal/brier_reward/group_std_mean": 0.16133702794710794, "signal/brier_reward/group_zero_std_frac": 0.0, "signal/brier_reward/scaled_weighted_centered_abs_mean": 0.1783244808514913, "signal/brier_reward/weight": 0.10000000149011612, "signal/brier_reward/weighted_centered_abs_mean": 0.012450356036424637, "signal/confidence_uniqueness_reward/centered_abs_mean": 0.026240961626172066, "signal/confidence_uniqueness_reward/group_std_mean": 0.04592407991488775, "signal/confidence_uniqueness_reward/group_zero_std_frac": 0.0, "signal/confidence_uniqueness_reward/scaled_weighted_centered_abs_mean": 0.037502444038788475, "signal/confidence_uniqueness_reward/weight": 0.10000000149011612, "signal/confidence_uniqueness_reward/weighted_centered_abs_mean": 0.0026240963488817215, "signal/format_reward/centered_abs_mean": 0.013834635416666666, "signal/format_reward/group_std_mean": 0.031048130244016647, "signal/format_reward/group_zero_std_frac": 0.8518518606821696, "signal/format_reward/scaled_weighted_centered_abs_mean": 0.09567297746737798, "signal/format_reward/weight": 0.5, "signal/format_reward/weighted_centered_abs_mean": 0.006917317708333333, "signal/frontier_entropy_batch_reward/centered_abs_mean": 0.3136086066563924, "signal/frontier_entropy_batch_reward/group_std_mean": 0.3837442596753438, "signal/frontier_entropy_batch_reward/group_zero_std_frac": 0.0, "signal/frontier_entropy_batch_reward/scaled_weighted_centered_abs_mean": 0.45284825563430786, "signal/frontier_entropy_batch_reward/weight": 0.10000000149011612, "signal/frontier_entropy_batch_reward/weighted_centered_abs_mean": 0.031360860293110214, "step": 208, "total_flos": 0.0, "train_loss": -0.04336074522535245, "train_runtime": 40363.2953, "train_samples_per_second": 0.372, "train_steps_per_second": 0.005 } ], "logging_steps": 5, "max_steps": 208, "num_input_tokens_seen": 440554243, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }