Files
PureRL-1.5B-v7-s2-l1-maskon…/trainer_state.json
ModelHub XC 4d644bc719 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l1-maskon-afew
Source: Original Platform
2026-06-04 15:55:33 +08:00

12243 lines
501 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.47760647535324097,
"adv/mean_abs_reasoning": 0.4569147527217865,
"adv/mean_abs_step_conf": 0.7636127471923828,
"adv/ratio_final_to_reasoning": 1.0452857398632815,
"adv/ratio_step_to_reasoning": 1.6712367955808674,
"adv/std_final_conf": 0.7227410674095154,
"adv/std_reasoning": 0.7206857204437256,
"adv/std_step_conf": 0.9327075481414795,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5086206896551725,
"calib/avg_num_step_conf": 7.875,
"calib/ece": 0.2888991935483871,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001713264989126051,
"calib/mean_conf": 0.9905120967741936,
"calib/mu_c": 0.9905632183908043,
"calib/mu_w": 0.9903918918918917,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2888991935483871,
"calib/std_conf": 0.0021794159006610276,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9119477557027226,
"calib/step_q_c_n": 1359.0,
"calib/step_q_gap": 0.0056311651395566376,
"calib/step_q_w": 0.9063165905631659,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2494.0,
"completions/max_terminated_length": 2494.0,
"completions/mean_length": 755.49609375,
"completions/mean_terminated_length": 776.7349243164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.02494852803647518,
"kl": 0.0005849599838256836,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.1254,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018737709149718285,
"mask/share_reasoning": 0.845859944820404,
"mask/share_step_conf": 0.10805858671665192,
"num_tokens": 300991.0,
"reward": 0.8340441584587097,
"reward_std": 0.239455908536911,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.6851503849029541,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.6532504558563232,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.437887966632843,
"adv/mean_abs_reasoning": 0.4207462966442108,
"adv/mean_abs_step_conf": 0.7377474308013916,
"adv/ratio_final_to_reasoning": 1.0407411072310102,
"adv/ratio_step_to_reasoning": 1.7534258451839484,
"adv/std_final_conf": 0.6832791566848755,
"adv/std_reasoning": 0.6817297339439392,
"adv/std_step_conf": 0.9317809343338013,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4872611464968153,
"calib/avg_num_step_conf": 7.6953125,
"calib/ece": 0.36465737051792824,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00024481637078155316,
"calib/mean_conf": 0.9901553784860557,
"calib/mu_c": 0.990063694267516,
"calib/mu_w": 0.9903085106382975,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36465737051792824,
"calib/std_conf": 0.001222205307190084,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9075405636208369,
"calib/step_q_c_n": 1171.0,
"calib/step_q_gap": -0.003804868168900244,
"calib/step_q_w": 0.9113454317897371,
"calib/step_q_w_n": 799.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2743.0,
"completions/max_terminated_length": 2743.0,
"completions/mean_length": 840.640625,
"completions/mean_terminated_length": 850.6087036132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 466.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.02235162816941738,
"kl": 0.0016820430755615234,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0082,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01788702979683876,
"mask/share_reasoning": 0.8706268668174744,
"mask/share_step_conf": 0.09976735711097717,
"num_tokens": 619483.0,
"reward": 0.7749876976013184,
"reward_std": 0.22135242819786072,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6203019618988037,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6109234094619751,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.4195403754711151,
"adv/mean_abs_reasoning": 0.39365625381469727,
"adv/mean_abs_step_conf": 0.7564386129379272,
"adv/ratio_final_to_reasoning": 1.065753106690392,
"adv/ratio_step_to_reasoning": 1.9215714360122924,
"adv/std_final_conf": 0.6908338665962219,
"adv/std_reasoning": 0.6816103458404541,
"adv/std_step_conf": 0.9311906695365906,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.49362723046933576,
"calib/avg_num_step_conf": 7.76953125,
"calib/ece": 0.3032128514056225,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00024516419253228783,
"calib/mean_conf": 0.9899598393574297,
"calib/mu_c": 0.9898830409356724,
"calib/mu_w": 0.9901282051282047,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3032128514056225,
"calib/std_conf": 0.002101441839670866,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9130769230769231,
"calib/step_q_c_n": 1287.0,
"calib/step_q_gap": 0.0029487179487178716,
"calib/step_q_w": 0.9101282051282052,
"calib/step_q_w_n": 702.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 794.72265625,
"completions/mean_terminated_length": 810.5538330078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.0032,
"grad_norm": 0.022143350914120674,
"kl": 0.0004946589469909668,
"learning_rate": 7.5e-07,
"loss": -0.1178,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.018371999263763428,
"mask/share_reasoning": 0.8613105416297913,
"mask/share_step_conf": 0.1007862240076065,
"num_tokens": 928188.0,
"reward": 0.8207460641860962,
"reward_std": 0.19138771295547485,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.673882007598877,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.6394850611686707,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.4774017930030823,
"adv/mean_abs_reasoning": 0.4621298909187317,
"adv/mean_abs_step_conf": 0.7561236619949341,
"adv/ratio_final_to_reasoning": 1.0330467740443914,
"adv/ratio_step_to_reasoning": 1.6361712948100624,
"adv/std_final_conf": 0.7400308847427368,
"adv/std_reasoning": 0.7393571734428406,
"adv/std_step_conf": 0.9313958287239075,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.49078106852497094,
"calib/avg_num_step_conf": 7.82421875,
"calib/ece": 0.31812000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00018437862950060335,
"calib/mean_conf": 0.9901200000000001,
"calib/mu_c": 0.9900595238095237,
"calib/mu_w": 0.9902439024390243,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31812000000000007,
"calib/std_conf": 0.001088852607105297,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9106230529595015,
"calib/step_q_c_n": 1284.0,
"calib/step_q_gap": 0.0040305633906558835,
"calib/step_q_w": 0.9065924895688456,
"calib/step_q_w_n": 719.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2919.0,
"completions/max_terminated_length": 2919.0,
"completions/mean_length": 786.1640625,
"completions/mean_terminated_length": 792.3543090820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 402.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.020883694291114807,
"kl": 0.0005683302879333496,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.017,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018774813041090965,
"mask/share_reasoning": 0.8692216873168945,
"mask/share_step_conf": 0.10419103503227234,
"num_tokens": 1235614.0,
"reward": 0.8037224411964417,
"reward_std": 0.24146485328674316,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.6623257398605347,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.6193378567695618,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.4146665036678314,
"adv/mean_abs_reasoning": 0.40848180651664734,
"adv/mean_abs_step_conf": 0.7514338493347168,
"adv/ratio_final_to_reasoning": 1.015140691831356,
"adv/ratio_step_to_reasoning": 1.8395772769970178,
"adv/std_final_conf": 0.7020092010498047,
"adv/std_reasoning": 0.7015328407287598,
"adv/std_step_conf": 0.9299687147140503,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.5117074181516784,
"calib/avg_num_step_conf": 7.5,
"calib/ece": 0.46298340248962655,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0005958695952477111,
"calib/mean_conf": 0.989954356846473,
"calib/mu_c": 0.9902362204724408,
"calib/mu_w": 0.989640350877193,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46298340248962655,
"calib/std_conf": 0.0030761415755768723,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.913924949290061,
"calib/step_q_c_n": 986.0,
"calib/step_q_gap": 0.011276127020253712,
"calib/step_q_w": 0.9026488222698072,
"calib/step_q_w_n": 934.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2891.0,
"completions/max_terminated_length": 2891.0,
"completions/mean_length": 818.61328125,
"completions/mean_terminated_length": 851.8901977539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 476.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.024567440152168274,
"kl": 0.000652611255645752,
"learning_rate": 1.25e-06,
"loss": -0.1111,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.017622657120227814,
"mask/share_reasoning": 0.8454317450523376,
"mask/share_step_conf": 0.09788313508033752,
"num_tokens": 1551867.0,
"reward": 0.6550729274749756,
"reward_std": 0.2066141963005066,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.50483238697052,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l1_reward": 0.5193760395050049,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.43477505445480347,
"adv/mean_abs_reasoning": 0.397172749042511,
"adv/mean_abs_step_conf": 0.7250959277153015,
"adv/ratio_final_to_reasoning": 1.094674938053889,
"adv/ratio_step_to_reasoning": 1.8256437015463305,
"adv/std_final_conf": 0.7204685211181641,
"adv/std_reasoning": 0.7015130519866943,
"adv/std_step_conf": 0.9325665235519409,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.479936221100186,
"calib/avg_num_step_conf": 8.28125,
"calib/ece": 0.4176612903225808,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00040127557799640723,
"calib/mean_conf": 0.9902419354838712,
"calib/mu_c": 0.990070422535211,
"calib/mu_w": 0.9904716981132075,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4176612903225808,
"calib/std_conf": 0.0015364966841336825,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9122771403353929,
"calib/step_q_c_n": 1133.0,
"calib/step_q_gap": -0.003421947810503867,
"calib/step_q_w": 0.9156990881458967,
"calib/step_q_w_n": 987.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 1835.0,
"completions/max_terminated_length": 1835.0,
"completions/mean_length": 726.96484375,
"completions/mean_terminated_length": 747.4015502929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 457.0,
"epoch": 0.0064,
"grad_norm": 0.03217747434973717,
"kl": 0.0018596649169921875,
"learning_rate": 1.5e-06,
"loss": -0.1455,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.019399240612983704,
"mask/share_reasoning": 0.8410153388977051,
"mask/share_step_conf": 0.1122417226433754,
"num_tokens": 1843922.0,
"reward": 0.7091950178146362,
"reward_std": 0.22388148307800293,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.562483549118042,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.5512189269065857,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.44878897070884705,
"adv/mean_abs_reasoning": 0.37493398785591125,
"adv/mean_abs_step_conf": 0.7872533202171326,
"adv/ratio_final_to_reasoning": 1.196981296028352,
"adv/ratio_step_to_reasoning": 2.0997118045208465,
"adv/std_final_conf": 0.7194306254386902,
"adv/std_reasoning": 0.6612817049026489,
"adv/std_step_conf": 0.9330607652664185,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4866102889358703,
"calib/avg_num_step_conf": 7.48828125,
"calib/ece": 0.33301593625498005,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00024348132487661545,
"calib/mean_conf": 0.9903864541832669,
"calib/mu_c": 0.9903030303030301,
"calib/mu_w": 0.9905465116279067,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33301593625498005,
"calib/std_conf": 0.0018993749697701293,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9096,
"calib/step_q_c_n": 1250.0,
"calib/step_q_gap": 0.0018638680659669449,
"calib/step_q_w": 0.907736131934033,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 862.09375,
"completions/mean_terminated_length": 868.8818969726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.041077230125665665,
"kl": 0.0005017518997192383,
"learning_rate": 1.75e-06,
"loss": -0.007,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.017408661544322968,
"mask/share_reasoning": 0.8800719380378723,
"mask/share_step_conf": 0.09470690786838531,
"num_tokens": 2172042.0,
"reward": 0.8021071553230286,
"reward_std": 0.20688104629516602,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6507886648178101,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6284255981445312,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.38437363505363464,
"adv/mean_abs_reasoning": 0.3675369918346405,
"adv/mean_abs_step_conf": 0.7491437196731567,
"adv/ratio_final_to_reasoning": 1.0458093840702956,
"adv/ratio_step_to_reasoning": 2.0382811426236134,
"adv/std_final_conf": 0.6616430282592773,
"adv/std_reasoning": 0.6612933278083801,
"adv/std_step_conf": 0.932352602481842,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5152616829508445,
"calib/avg_num_step_conf": 7.53125,
"calib/ece": 0.32172131147540983,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9959016393442623,
"calib/gap": 0.0012951601908653965,
"calib/mean_conf": 0.9897540983606558,
"calib/mu_c": 0.9901840490797544,
"calib/mu_w": 0.988888888888889,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32172131147540983,
"calib/std_conf": 0.005862231818340563,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9095775792038994,
"calib/step_q_c_n": 1231.0,
"calib/step_q_gap": 0.006005125832306835,
"calib/step_q_w": 0.9035724533715925,
"calib/step_q_w_n": 697.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2495.0,
"completions/max_terminated_length": 2495.0,
"completions/mean_length": 813.83984375,
"completions/mean_terminated_length": 833.3720092773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.020675357431173325,
"kl": 0.0009307861328125,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.1701,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01763816736638546,
"mask/share_reasoning": 0.8619741201400757,
"mask/share_step_conf": 0.0969502180814743,
"num_tokens": 2486897.0,
"reward": 0.790767252445221,
"reward_std": 0.17744673788547516,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6436171531677246,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.6199485659599304,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.4865243136882782,
"adv/mean_abs_reasoning": 0.4591737985610962,
"adv/mean_abs_step_conf": 0.7573858499526978,
"adv/ratio_final_to_reasoning": 1.059564625013208,
"adv/ratio_step_to_reasoning": 1.6494535453157448,
"adv/std_final_conf": 0.7589576244354248,
"adv/std_reasoning": 0.7576943039894104,
"adv/std_step_conf": 0.9348429441452026,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.5019674935842601,
"calib/avg_num_step_conf": 8.203125,
"calib/ece": 0.2855232067510547,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 5.1069289991545475e-05,
"calib/mean_conf": 0.9901645569620252,
"calib/mu_c": 0.9901796407185626,
"calib/mu_w": 0.990128571428571,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2855232067510547,
"calib/std_conf": 0.001257185723335205,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9104324324324324,
"calib/step_q_c_n": 1295.0,
"calib/step_q_gap": 0.05269951317777399,
"calib/step_q_w": 0.8577329192546584,
"calib/step_q_w_n": 805.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2535.0,
"completions/max_terminated_length": 2535.0,
"completions/mean_length": 785.21875,
"completions/mean_terminated_length": 823.8359985351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.0096,
"grad_norm": 0.021421346813440323,
"kl": 0.0006436705589294434,
"learning_rate": 2.25e-06,
"loss": -0.1237,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.017515994608402252,
"mask/share_reasoning": 0.837012529373169,
"mask/share_step_conf": 0.09859649091959,
"num_tokens": 2795449.0,
"reward": 0.7842223644256592,
"reward_std": 0.26130688190460205,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6576511859893799,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": 0.5951685905456543,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.48653459548950195,
"adv/mean_abs_reasoning": 0.47618716955184937,
"adv/mean_abs_step_conf": 0.786577582359314,
"adv/ratio_final_to_reasoning": 1.0217297453591427,
"adv/ratio_step_to_reasoning": 1.6518243931258798,
"adv/std_final_conf": 0.7380922436714172,
"adv/std_reasoning": 0.7393761873245239,
"adv/std_step_conf": 0.9339465498924255,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.4885918253079507,
"calib/avg_num_step_conf": 7.77734375,
"calib/ece": 0.37236585365853647,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00017749160134328257,
"calib/mean_conf": 0.9902520325203251,
"calib/mu_c": 0.9901842105263159,
"calib/mu_w": 0.9903617021276592,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37236585365853647,
"calib/std_conf": 0.0014984734559308761,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9109103690685413,
"calib/step_q_c_n": 1138.0,
"calib/step_q_gap": -0.004569115104963939,
"calib/step_q_w": 0.9154794841735052,
"calib/step_q_w_n": 853.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2629.0,
"completions/max_terminated_length": 2629.0,
"completions/mean_length": 806.5625,
"completions/mean_terminated_length": 825.9200439453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 491.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.016852308064699173,
"kl": 0.0006428956985473633,
"learning_rate": 2.5e-06,
"loss": -0.1333,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01787388324737549,
"mask/share_reasoning": 0.8611484169960022,
"mask/share_step_conf": 0.09754019975662231,
"num_tokens": 3108729.0,
"reward": 0.7515081167221069,
"reward_std": 0.24571159482002258,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6007345914840698,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5913439989089966,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.4119076430797577,
"adv/mean_abs_reasoning": 0.3775365948677063,
"adv/mean_abs_step_conf": 0.7417552471160889,
"adv/ratio_final_to_reasoning": 1.0910403088847465,
"adv/ratio_step_to_reasoning": 1.964724101450376,
"adv/std_final_conf": 0.7001214623451233,
"adv/std_reasoning": 0.6815639734268188,
"adv/std_step_conf": 0.9311968088150024,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.49538041576258135,
"calib/avg_num_step_conf": 7.19140625,
"calib/ece": 0.35708870967741946,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -8.77721005110299e-05,
"calib/mean_conf": 0.9901532258064517,
"calib/mu_c": 0.9901210191082801,
"calib/mu_w": 0.9902087912087911,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35708870967741946,
"calib/std_conf": 0.0011984153168213823,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9105598650927487,
"calib/step_q_c_n": 1186.0,
"calib/step_q_gap": 0.0014102467721378575,
"calib/step_q_w": 0.9091496183206108,
"calib/step_q_w_n": 655.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2731.0,
"completions/max_terminated_length": 2731.0,
"completions/mean_length": 855.82421875,
"completions/mean_terminated_length": 865.9723510742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 319.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.017827648669481277,
"kl": 0.0007291436195373535,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0407,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.017785221338272095,
"mask/share_reasoning": 0.8739660978317261,
"mask/share_step_conf": 0.09652996808290482,
"num_tokens": 3432300.0,
"reward": 0.7854070663452148,
"reward_std": 0.1878044605255127,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6201468706130981,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.6342610120773315,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.5210590362548828,
"adv/mean_abs_reasoning": 0.4791218936443329,
"adv/mean_abs_step_conf": 0.7472091913223267,
"adv/ratio_final_to_reasoning": 1.0875291719432074,
"adv/ratio_step_to_reasoning": 1.5595388172284261,
"adv/std_final_conf": 0.7741323709487915,
"adv/std_reasoning": 0.7577143907546997,
"adv/std_step_conf": 0.9301572442054749,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.49634538152610447,
"calib/avg_num_step_conf": 8.14453125,
"calib/ece": 0.301286307053942,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -7.309236947772302e-05,
"calib/mean_conf": 0.9900829875518673,
"calib/mu_c": 0.9900602409638553,
"calib/mu_w": 0.990133333333333,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.301286307053942,
"calib/std_conf": 0.0009071871829491895,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9150713266761767,
"calib/step_q_c_n": 1402.0,
"calib/step_q_gap": 0.01657937938481513,
"calib/step_q_w": 0.8984919472913616,
"calib/step_q_w_n": 683.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2883.0,
"completions/max_terminated_length": 2883.0,
"completions/mean_length": 728.11328125,
"completions/mean_terminated_length": 770.2355346679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 379.0,
"epoch": 0.0128,
"grad_norm": 0.046004943549633026,
"kl": 0.0010104775428771973,
"learning_rate": 3e-06,
"loss": -0.2455,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018598269671201706,
"mask/share_reasoning": 0.8154910802841187,
"mask/share_step_conf": 0.11122316122055054,
"num_tokens": 3722873.0,
"reward": 0.7990297675132751,
"reward_std": 0.24911078810691833,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6502195000648499,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.6306524276733398,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.3958421051502228,
"adv/mean_abs_reasoning": 0.36470189690589905,
"adv/mean_abs_step_conf": 0.7317424416542053,
"adv/ratio_final_to_reasoning": 1.085385375037845,
"adv/ratio_step_to_reasoning": 2.006412491578048,
"adv/std_final_conf": 0.6820527911186218,
"adv/std_reasoning": 0.6613563299179077,
"adv/std_step_conf": 0.9328783750534058,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5020423892100193,
"calib/avg_num_step_conf": 7.953125,
"calib/ece": 0.2925,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -7.552986512504578e-05,
"calib/mean_conf": 0.9900806451612904,
"calib/mu_c": 0.9900578034682079,
"calib/mu_w": 0.990133333333333,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2925,
"calib/std_conf": 0.0023745857845702367,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9135067873303169,
"calib/step_q_c_n": 1326.0,
"calib/step_q_gap": 0.008746223950034993,
"calib/step_q_w": 0.9047605633802819,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1835.0,
"completions/max_terminated_length": 1835.0,
"completions/mean_length": 771.23046875,
"completions/mean_terminated_length": 786.5936279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.025861399248242378,
"kl": 0.010151028633117676,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.1128,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01878371834754944,
"mask/share_reasoning": 0.8545902967453003,
"mask/share_step_conf": 0.10709473490715027,
"num_tokens": 4024900.0,
"reward": 0.8200558423995972,
"reward_std": 0.19809523224830627,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.6814616918563843,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.629743754863739,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.4357926845550537,
"adv/mean_abs_reasoning": 0.4326225519180298,
"adv/mean_abs_step_conf": 0.757068932056427,
"adv/ratio_final_to_reasoning": 1.007327710085776,
"adv/ratio_step_to_reasoning": 1.7499525364546207,
"adv/std_final_conf": 0.720037579536438,
"adv/std_reasoning": 0.7205063104629517,
"adv/std_step_conf": 0.933967649936676,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5018094486116594,
"calib/avg_num_step_conf": 7.5078125,
"calib/ece": 0.3964820717131474,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 3.2570075009896726e-05,
"calib/mean_conf": 0.9901075697211156,
"calib/mu_c": 0.9901208053691273,
"calib/mu_w": 0.9900882352941174,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3964820717131474,
"calib/std_conf": 0.0009780369344452016,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9137217847769029,
"calib/step_q_c_n": 1143.0,
"calib/step_q_gap": 0.014770565264707769,
"calib/step_q_w": 0.8989512195121951,
"calib/step_q_w_n": 779.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 817.81640625,
"completions/mean_terminated_length": 827.5138549804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 438.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.0291427131742239,
"kl": 0.001428365707397461,
"learning_rate": 3.5e-06,
"loss": -0.0811,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.018288377672433853,
"mask/share_reasoning": 0.8677775859832764,
"mask/share_step_conf": 0.10221526026725769,
"num_tokens": 4339661.0,
"reward": 0.7529997825622559,
"reward_std": 0.21627689898014069,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.5898327827453613,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6036667227745056,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.43864238262176514,
"adv/mean_abs_reasoning": 0.41422170400619507,
"adv/mean_abs_step_conf": 0.7357077598571777,
"adv/ratio_final_to_reasoning": 1.0589555747064496,
"adv/ratio_step_to_reasoning": 1.7761207409985802,
"adv/std_final_conf": 0.7389703989028931,
"adv/std_reasoning": 0.7204132080078125,
"adv/std_step_conf": 0.9317857623100281,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.48568211068211065,
"calib/avg_num_step_conf": 7.5625,
"calib/ece": 0.432398406374502,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00021685971685947436,
"calib/mean_conf": 0.9901673306772908,
"calib/mu_c": 0.9900714285714285,
"calib/mu_w": 0.990288288288288,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.432398406374502,
"calib/std_conf": 0.0011992230412249103,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9132129629629631,
"calib/step_q_c_n": 1080.0,
"calib/step_q_gap": 0.0035552526825892716,
"calib/step_q_w": 0.9096577102803738,
"calib/step_q_w_n": 856.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1581.0,
"completions/max_terminated_length": 1581.0,
"completions/mean_length": 737.5234375,
"completions/mean_terminated_length": 752.2151489257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 421.0,
"epoch": 0.016,
"grad_norm": 0.025324687361717224,
"kl": 0.0011706352233886719,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0424,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01934191584587097,
"mask/share_reasoning": 0.8563178777694702,
"mask/share_step_conf": 0.1048089861869812,
"num_tokens": 4636347.0,
"reward": 0.7159147262573242,
"reward_std": 0.21545015275478363,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5552006363868713,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.571160078048706,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.3485468029975891,
"adv/mean_abs_reasoning": 0.341163694858551,
"adv/mean_abs_step_conf": 0.7923579216003418,
"adv/ratio_final_to_reasoning": 1.0216409549149101,
"adv/ratio_step_to_reasoning": 2.322515360049841,
"adv/std_final_conf": 0.6164584159851074,
"adv/std_reasoning": 0.6186478734016418,
"adv/std_step_conf": 0.9324037432670593,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.78515625,
"calib/ece": 0.3124489795918367,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 3.3306690738754696e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999995,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3124489795918367,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9126959247648903,
"calib/step_q_c_n": 1276.0,
"calib/step_q_gap": 0.005485325043830258,
"calib/step_q_w": 0.90721059972106,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3046.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 886.8671875,
"completions/mean_terminated_length": 900.9445190429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 436.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.026192739605903625,
"kl": 0.0015244483947753906,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0456,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.016579438000917435,
"mask/share_reasoning": 0.8733184337615967,
"mask/share_step_conf": 0.09447716176509857,
"num_tokens": 4972233.0,
"reward": 0.7912975549697876,
"reward_std": 0.16759978234767914,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6545136570930481,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6062064170837402,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.5062482357025146,
"adv/mean_abs_reasoning": 0.4920458197593689,
"adv/mean_abs_step_conf": 0.7667862176895142,
"adv/ratio_final_to_reasoning": 1.0288640109778624,
"adv/ratio_step_to_reasoning": 1.5583634427877162,
"adv/std_final_conf": 0.7398620247840881,
"adv/std_reasoning": 0.7394651770591736,
"adv/std_step_conf": 0.932716965675354,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5029411764705882,
"calib/avg_num_step_conf": 7.98046875,
"calib/ece": 0.29616326530612247,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 5.8823529411999864e-05,
"calib/mean_conf": 0.9900408163265306,
"calib/mu_c": 0.9900588235294118,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29616326530612247,
"calib/std_conf": 0.0006375714021148296,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9090542521994135,
"calib/step_q_c_n": 1364.0,
"calib/step_q_gap": 0.0075225879873369594,
"calib/step_q_w": 0.9015316642120765,
"calib/step_q_w_n": 679.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3004.0,
"completions/max_terminated_length": 3004.0,
"completions/mean_length": 786.265625,
"completions/mean_terminated_length": 808.3694458007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 350.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.03347507119178772,
"kl": 0.002835869789123535,
"learning_rate": 4.25e-06,
"loss": -0.1046,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.018158677965402603,
"mask/share_reasoning": 0.8490204811096191,
"mask/share_step_conf": 0.10547702014446259,
"num_tokens": 5277045.0,
"reward": 0.8143203258514404,
"reward_std": 0.26758062839508057,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.6698265075683594,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6338140964508057,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.4228481650352478,
"adv/mean_abs_reasoning": 0.4182824492454529,
"adv/mean_abs_step_conf": 0.738125205039978,
"adv/ratio_final_to_reasoning": 1.0109153893452405,
"adv/ratio_step_to_reasoning": 1.7646573657859543,
"adv/std_final_conf": 0.6994585990905762,
"adv/std_reasoning": 0.7013540863990784,
"adv/std_step_conf": 0.9333730340003967,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.36328125,
"calib/ece": 0.4357831325301206,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4357831325301206,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9089214758751183,
"calib/step_q_c_n": 1057.0,
"calib/step_q_gap": 0.00040698312149511917,
"calib/step_q_w": 0.9085144927536232,
"calib/step_q_w_n": 828.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 816.140625,
"completions/mean_terminated_length": 825.8182373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.0192,
"grad_norm": 0.03060261346399784,
"kl": 0.0027538537979125977,
"learning_rate": 4.5e-06,
"loss": -0.0052,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.01828933134675026,
"mask/share_reasoning": 0.8699996471405029,
"mask/share_step_conf": 0.09999223798513412,
"num_tokens": 5596697.0,
"reward": 0.6926410794258118,
"reward_std": 0.19804269075393677,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5476371049880981,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.5353012681007385,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.3152657747268677,
"adv/mean_abs_reasoning": 0.2920442521572113,
"adv/mean_abs_step_conf": 0.7524843811988831,
"adv/ratio_final_to_reasoning": 1.0795137120423652,
"adv/ratio_step_to_reasoning": 2.5766108240124197,
"adv/std_final_conf": 0.595735490322113,
"adv/std_reasoning": 0.5726684331893921,
"adv/std_step_conf": 0.9318787455558777,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4954545454545455,
"calib/avg_num_step_conf": 7.546875,
"calib/ece": 0.421407843137255,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -8.181818181784983e-05,
"calib/mean_conf": 0.9900352941176471,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.990081818181818,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.421407843137255,
"calib/std_conf": 0.0005624956747238556,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9118380952380952,
"calib/step_q_c_n": 1050.0,
"calib/step_q_gap": -0.001271882086167908,
"calib/step_q_w": 0.9131099773242631,
"calib/step_q_w_n": 882.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1819.0,
"completions/max_terminated_length": 1819.0,
"completions/mean_length": 774.671875,
"completions/mean_terminated_length": 777.7098388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.024774307385087013,
"kl": 0.0026723146438598633,
"learning_rate": 4.75e-06,
"loss": 0.0014,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.018909579142928123,
"mask/share_reasoning": 0.87503582239151,
"mask/share_step_conf": 0.10214833915233612,
"num_tokens": 5899773.0,
"reward": 0.7455166578292847,
"reward_std": 0.15792196989059448,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5748304128646851,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l1_reward": 0.6037027835845947,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.46466466784477234,
"adv/mean_abs_reasoning": 0.42911815643310547,
"adv/mean_abs_step_conf": 0.7354490756988525,
"adv/ratio_final_to_reasoning": 1.0828361859752913,
"adv/ratio_step_to_reasoning": 1.7138614730544512,
"adv/std_final_conf": 0.7561859488487244,
"adv/std_reasoning": 0.739301323890686,
"adv/std_step_conf": 0.9323933720588684,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5048543689320388,
"calib/avg_num_step_conf": 7.84765625,
"calib/ece": 0.4063157894736842,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9919028340080972,
"calib/gap": 0.0016504854368935007,
"calib/mean_conf": 0.9893117408906882,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9883495145631065,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4063157894736842,
"calib/std_conf": 0.008094331487680583,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9133758503401361,
"calib/step_q_c_n": 1176.0,
"calib/step_q_gap": 0.004708383353341339,
"calib/step_q_w": 0.9086674669867948,
"calib/step_q_w_n": 833.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 775.45703125,
"completions/mean_terminated_length": 790.9044189453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 427.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.03557265177369118,
"kl": 0.0034775733947753906,
"learning_rate": 5e-06,
"loss": -0.0898,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.0192178413271904,
"mask/share_reasoning": 0.8497927784919739,
"mask/share_step_conf": 0.11145815998315811,
"num_tokens": 6203162.0,
"reward": 0.7248783111572266,
"reward_std": 0.22109970450401306,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5716238021850586,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5734453201293945,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.49779677391052246,
"adv/mean_abs_reasoning": 0.483091801404953,
"adv/mean_abs_step_conf": 0.728674054145813,
"adv/ratio_final_to_reasoning": 1.030439292206582,
"adv/ratio_step_to_reasoning": 1.5083552484779181,
"adv/std_final_conf": 0.7767319679260254,
"adv/std_reasoning": 0.775422990322113,
"adv/std_step_conf": 0.9327322840690613,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4968944099378882,
"calib/avg_num_step_conf": 8.02734375,
"calib/ece": 0.34333333333333327,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00012422360248454112,
"calib/mean_conf": 0.9899196787148594,
"calib/mu_c": 0.9898757763975156,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34333333333333327,
"calib/std_conf": 0.0012649008632950713,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9148021722265323,
"calib/step_q_c_n": 1289.0,
"calib/step_q_gap": 0.005350475098594765,
"calib/step_q_w": 0.9094516971279375,
"calib/step_q_w_n": 766.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 776.6953125,
"completions/mean_terminated_length": 789.0238647460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.0224,
"grad_norm": 6.381322860717773,
"kl": 0.11067986488342285,
"learning_rate": 4.9722222222222224e-06,
"loss": 0.0292,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019109871238470078,
"mask/share_reasoning": 0.8561791181564331,
"mask/share_step_conf": 0.10908600687980652,
"num_tokens": 6504956.0,
"reward": 0.7912448644638062,
"reward_std": 0.26544326543807983,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6356808543205261,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.6264961957931519,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.2924252152442932,
"adv/mean_abs_reasoning": 0.291792631149292,
"adv/mean_abs_step_conf": 0.7459397315979004,
"adv/ratio_final_to_reasoning": 1.002167923475345,
"adv/ratio_step_to_reasoning": 2.5564035961423914,
"adv/std_final_conf": 0.5933868885040283,
"adv/std_reasoning": 0.5960041880607605,
"adv/std_step_conf": 0.932509183883667,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.94140625,
"calib/ece": 0.33262948207171317,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33262948207171317,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9163239875389407,
"calib/step_q_c_n": 1284.0,
"calib/step_q_gap": 0.003173119715175643,
"calib/step_q_w": 0.9131508678237651,
"calib/step_q_w_n": 749.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1642.0,
"completions/max_terminated_length": 1642.0,
"completions/mean_length": 726.9375,
"completions/mean_terminated_length": 741.4183349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.01994170807301998,
"kl": 0.004857063293457031,
"learning_rate": 4.944444444444445e-06,
"loss": -0.128,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.01959868147969246,
"mask/share_reasoning": 0.8490219116210938,
"mask/share_step_conf": 0.11184822022914886,
"num_tokens": 6792868.0,
"reward": 0.8057518005371094,
"reward_std": 0.1436442732810974,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6511518955230713,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6353515982627869,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.5219694375991821,
"adv/mean_abs_reasoning": 0.5116857290267944,
"adv/mean_abs_step_conf": 0.740062952041626,
"adv/ratio_final_to_reasoning": 1.0200977044873754,
"adv/ratio_step_to_reasoning": 1.4463232215782054,
"adv/std_final_conf": 0.7941882610321045,
"adv/std_reasoning": 0.7927516102790833,
"adv/std_step_conf": 0.9338461756706238,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.1015625,
"calib/ece": 0.382857142857143,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.382857142857143,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9151562499999999,
"calib/step_q_c_n": 1216.0,
"calib/step_q_gap": 0.003699373543123441,
"calib/step_q_w": 0.9114568764568765,
"calib/step_q_w_n": 858.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2281.0,
"completions/max_terminated_length": 2281.0,
"completions/mean_length": 794.86328125,
"completions/mean_terminated_length": 797.98046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 428.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.0262162946164608,
"kl": 0.005928754806518555,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0118,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.018801795318722725,
"mask/share_reasoning": 0.8657557368278503,
"mask/share_step_conf": 0.11153615266084671,
"num_tokens": 7100289.0,
"reward": 0.7581058740615845,
"reward_std": 0.28139084577560425,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6052922010421753,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": 0.5945133566856384,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.5681447982788086,
"adv/mean_abs_reasoning": 0.54884934425354,
"adv/mean_abs_step_conf": 0.7714293003082275,
"adv/ratio_final_to_reasoning": 1.0351561940034952,
"adv/ratio_step_to_reasoning": 1.4055392584232862,
"adv/std_final_conf": 0.7933010458946228,
"adv/std_reasoning": 0.7930247187614441,
"adv/std_step_conf": 0.9330933094024658,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5047169811320755,
"calib/avg_num_step_conf": 7.83203125,
"calib/ece": 0.42052845528455285,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9959349593495935,
"calib/gap": 0.0008490566037736569,
"calib/mean_conf": 0.9896341463414634,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9891509433962263,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.42052845528455285,
"calib/std_conf": 0.005726515552133605,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9156255625562556,
"calib/step_q_c_n": 1111.0,
"calib/step_q_gap": 0.005949947343727668,
"calib/step_q_w": 0.9096756152125279,
"calib/step_q_w_n": 894.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 1686.0,
"completions/max_terminated_length": 1686.0,
"completions/mean_length": 764.26171875,
"completions/mean_terminated_length": 792.1093139648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.0256,
"grad_norm": 0.025009525939822197,
"kl": 0.007487773895263672,
"learning_rate": 4.888888888888889e-06,
"loss": -0.1196,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01863842085003853,
"mask/share_reasoning": 0.8378759026527405,
"mask/share_step_conf": 0.1083294153213501,
"num_tokens": 7400452.0,
"reward": 0.7031575441360474,
"reward_std": 0.30026620626449585,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5555691123008728,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.550745964050293,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.3911153972148895,
"adv/mean_abs_reasoning": 0.389061838388443,
"adv/mean_abs_step_conf": 0.7705321907997131,
"adv/ratio_final_to_reasoning": 1.0052782324654423,
"adv/ratio_step_to_reasoning": 1.9804877137047983,
"adv/std_final_conf": 0.659353494644165,
"adv/std_reasoning": 0.6613019704818726,
"adv/std_step_conf": 0.9323583841323853,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.1015625,
"calib/ece": 0.3645019920318725,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3645019920318725,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9160833333333332,
"calib/step_q_c_n": 1200.0,
"calib/step_q_gap": -0.00015236460717016342,
"calib/step_q_w": 0.9162356979405034,
"calib/step_q_w_n": 874.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1524.0,
"completions/max_terminated_length": 1524.0,
"completions/mean_length": 728.5859375,
"completions/mean_terminated_length": 743.099609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 403.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.0166942048817873,
"kl": 0.00994873046875,
"learning_rate": 4.861111111111111e-06,
"loss": -0.0865,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019598091021180153,
"mask/share_reasoning": 0.8502811789512634,
"mask/share_step_conf": 0.11058945208787918,
"num_tokens": 7690194.0,
"reward": 0.7656707763671875,
"reward_std": 0.18828433752059937,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6204491853713989,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": 0.5929235219955444,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.39370179176330566,
"adv/mean_abs_reasoning": 0.391094446182251,
"adv/mean_abs_step_conf": 0.7542855739593506,
"adv/ratio_final_to_reasoning": 1.0066667926545796,
"adv/ratio_step_to_reasoning": 1.9286532481411194,
"adv/std_final_conf": 0.6602107882499695,
"adv/std_reasoning": 0.6612504720687866,
"adv/std_step_conf": 0.9315699934959412,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.80859375,
"calib/ece": 0.3457312252964426,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3457312252964426,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9167488076311605,
"calib/step_q_c_n": 1258.0,
"calib/step_q_gap": 0.0005747185623345974,
"calib/step_q_w": 0.9161740890688259,
"calib/step_q_w_n": 741.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2000.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 758.3515625,
"completions/mean_terminated_length": 761.3255615234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 510.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.025107277557253838,
"kl": 0.012006759643554688,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0148,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019063986837863922,
"mask/share_reasoning": 0.8701338171958923,
"mask/share_step_conf": 0.10689593106508255,
"num_tokens": 7989572.0,
"reward": 0.7914588451385498,
"reward_std": 0.19029450416564941,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6436511278152466,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": 0.6142666339874268,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.4913161098957062,
"adv/mean_abs_reasoning": 0.49099820852279663,
"adv/mean_abs_step_conf": 0.7479568719863892,
"adv/ratio_final_to_reasoning": 1.0006474593336419,
"adv/ratio_step_to_reasoning": 1.5233393096008865,
"adv/std_final_conf": 0.7396231889724731,
"adv/std_reasoning": 0.7393175363540649,
"adv/std_step_conf": 0.9338703751564026,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4966216216216216,
"calib/avg_num_step_conf": 8.2109375,
"calib/ece": 0.40716535433070866,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00027027027027037853,
"calib/mean_conf": 0.9898425196850393,
"calib/mu_c": 0.9897297297297296,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40716535433070866,
"calib/std_conf": 0.0025048777512735247,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9176302931596092,
"calib/step_q_c_n": 1228.0,
"calib/step_q_gap": 0.0008911627248265885,
"calib/step_q_w": 0.9167391304347826,
"calib/step_q_w_n": 874.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2656.0,
"completions/max_terminated_length": 2656.0,
"completions/mean_length": 780.6796875,
"completions/mean_terminated_length": 783.7412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 474.0,
"epoch": 0.0288,
"grad_norm": 0.0525457039475441,
"kl": 0.013294219970703125,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0255,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01896524243056774,
"mask/share_reasoning": 0.8649869561195374,
"mask/share_step_conf": 0.11214153468608856,
"num_tokens": 8294642.0,
"reward": 0.7358760833740234,
"reward_std": 0.25070199370384216,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.5862976312637329,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": 0.5713919401168823,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.3187648057937622,
"adv/mean_abs_reasoning": 0.3049775958061218,
"adv/mean_abs_step_conf": 0.7408556342124939,
"adv/ratio_final_to_reasoning": 1.045207287936013,
"adv/ratio_step_to_reasoning": 2.4292133074701834,
"adv/std_final_conf": 0.6184374690055847,
"adv/std_reasoning": 0.6186066269874573,
"adv/std_step_conf": 0.930253803730011,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.18359375,
"calib/ece": 0.2976923076923076,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.9899999999999999,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2976923076923076,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9170390206579956,
"calib/step_q_c_n": 1307.0,
"calib/step_q_gap": 0.007283381560251212,
"calib/step_q_w": 0.9097556390977444,
"calib/step_q_w_n": 532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1604.0,
"completions/max_terminated_length": 1604.0,
"completions/mean_length": 770.26953125,
"completions/mean_terminated_length": 785.6135864257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.02304774895310402,
"kl": 0.012969017028808594,
"learning_rate": 4.777777777777778e-06,
"loss": -0.1212,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.018717553466558456,
"mask/share_reasoning": 0.8605189323425293,
"mask/share_step_conf": 0.10123226046562195,
"num_tokens": 8598775.0,
"reward": 0.815354585647583,
"reward_std": 0.18047857284545898,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.6738097667694092,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.6303368806838989,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.30036962032318115,
"adv/mean_abs_reasoning": 0.2887539565563202,
"adv/mean_abs_step_conf": 0.7428664565086365,
"adv/ratio_final_to_reasoning": 1.0402268557819583,
"adv/ratio_step_to_reasoning": 2.5726624333327313,
"adv/std_final_conf": 0.5961382389068604,
"adv/std_reasoning": 0.5960955619812012,
"adv/std_step_conf": 0.9326738119125366,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.88671875,
"calib/ece": 0.39800000000000013,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39800000000000013,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9174784110535407,
"calib/step_q_c_n": 1158.0,
"calib/step_q_gap": -0.0008607294807219112,
"calib/step_q_w": 0.9183391405342626,
"calib/step_q_w_n": 861.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2421.0,
"completions/max_terminated_length": 2421.0,
"completions/mean_length": 838.57421875,
"completions/mean_terminated_length": 855.2789306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 433.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.024902043864130974,
"kl": 0.012516975402832031,
"learning_rate": 4.75e-06,
"loss": -0.0868,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.017539583146572113,
"mask/share_reasoning": 0.864224374294281,
"mask/share_step_conf": 0.09870478510856628,
"num_tokens": 8920578.0,
"reward": 0.7263821363449097,
"reward_std": 0.16157840192317963,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.5859960317611694,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": 0.555830717086792,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.5582062602043152,
"adv/mean_abs_reasoning": 0.520081102848053,
"adv/mean_abs_step_conf": 0.7749965190887451,
"adv/ratio_final_to_reasoning": 1.073306176954868,
"adv/ratio_step_to_reasoning": 1.4901455077770211,
"adv/std_final_conf": 0.7905355095863342,
"adv/std_reasoning": 0.7754203081130981,
"adv/std_step_conf": 0.9350129961967468,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4897959183673469,
"calib/avg_num_step_conf": 8.23046875,
"calib/ece": 0.39616935483870974,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9879032258064516,
"calib/gap": -0.001836734693877351,
"calib/mean_conf": 0.9889112903225807,
"calib/mu_c": 0.9881632653061224,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.39616935483870974,
"calib/std_conf": 0.009838627048833353,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9200964630225079,
"calib/step_q_c_n": 1244.0,
"calib/step_q_gap": 0.0009608894419749214,
"calib/step_q_w": 0.919135573580533,
"calib/step_q_w_n": 863.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2959.0,
"completions/max_terminated_length": 2959.0,
"completions/mean_length": 820.453125,
"completions/mean_terminated_length": 840.14404296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.032,
"grad_norm": 0.052083615213632584,
"kl": 0.0153656005859375,
"learning_rate": 4.722222222222222e-06,
"loss": -0.1069,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.017525548115372658,
"mask/share_reasoning": 0.8551870584487915,
"mask/share_step_conf": 0.10384992510080338,
"num_tokens": 9237598.0,
"reward": 0.7097210884094238,
"reward_std": 0.25035855174064636,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.5817409753799438,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5306699275970459,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.5183556079864502,
"adv/mean_abs_reasoning": 0.5065758228302002,
"adv/mean_abs_step_conf": 0.7664437294006348,
"adv/ratio_final_to_reasoning": 1.0232537452941146,
"adv/ratio_step_to_reasoning": 1.512989161461699,
"adv/std_final_conf": 0.7564479112625122,
"adv/std_reasoning": 0.7576651573181152,
"adv/std_step_conf": 0.9329509735107422,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5004040948275862,
"calib/avg_num_step_conf": 8.58984375,
"calib/ece": 0.4646721311475409,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9918032786885246,
"calib/gap": 7.273706896571142e-05,
"calib/mean_conf": 0.9892622950819672,
"calib/mu_c": 0.9892968750000002,
"calib/mu_w": 0.9892241379310345,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4646721311475409,
"calib/std_conf": 0.008114754098360653,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9168671454219032,
"calib/step_q_c_n": 1114.0,
"calib/step_q_gap": -0.004450826928327323,
"calib/step_q_w": 0.9213179723502305,
"calib/step_q_w_n": 1085.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2684.0,
"completions/max_terminated_length": 2684.0,
"completions/mean_length": 832.390625,
"completions/mean_terminated_length": 859.2418823242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 464.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.025735603645443916,
"kl": 0.018184661865234375,
"learning_rate": 4.694444444444445e-06,
"loss": -0.1554,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.017364859580993652,
"mask/share_reasoning": 0.847489595413208,
"mask/share_step_conf": 0.10389558970928192,
"num_tokens": 9556602.0,
"reward": 0.6444910168647766,
"reward_std": 0.24960076808929443,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5095929503440857,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.48876410722732544,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.3183901906013489,
"adv/mean_abs_reasoning": 0.30197247862815857,
"adv/mean_abs_step_conf": 0.7719812989234924,
"adv/ratio_final_to_reasoning": 1.054368239277218,
"adv/ratio_step_to_reasoning": 2.556462438002806,
"adv/std_final_conf": 0.5945860147476196,
"adv/std_reasoning": 0.5727756023406982,
"adv/std_step_conf": 0.9287157654762268,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5103276353276354,
"calib/avg_num_step_conf": 8.21875,
"calib/ece": 0.41924302788844625,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": 0.0009447034447033209,
"calib/mean_conf": 0.9889641434262949,
"calib/mu_c": 0.9893706293706291,
"calib/mu_w": 0.9884259259259258,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41924302788844625,
"calib/std_conf": 0.008730282802526973,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.920017316017316,
"calib/step_q_c_n": 1155.0,
"calib/step_q_gap": 3.8390832911239237e-05,
"calib/step_q_w": 0.9199789251844047,
"calib/step_q_w_n": 949.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1755.0,
"completions/max_terminated_length": 1755.0,
"completions/mean_length": 782.78125,
"completions/mean_terminated_length": 798.37451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.04063679277896881,
"kl": 0.023967742919921875,
"learning_rate": 4.666666666666667e-06,
"loss": -0.0673,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.018448833376169205,
"mask/share_reasoning": 0.8532680869102478,
"mask/share_step_conf": 0.10875185579061508,
"num_tokens": 9863698.0,
"reward": 0.7091785073280334,
"reward_std": 0.1623982936143875,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5681651830673218,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.5423792600631714,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.4482853412628174,
"adv/mean_abs_reasoning": 0.4236627519130707,
"adv/mean_abs_step_conf": 0.7881948947906494,
"adv/ratio_final_to_reasoning": 1.05811837183836,
"adv/ratio_step_to_reasoning": 1.8604300029481358,
"adv/std_final_conf": 0.7025040984153748,
"adv/std_reasoning": 0.6816445589065552,
"adv/std_step_conf": 0.9337459206581116,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5040322580645161,
"calib/avg_num_step_conf": 7.80859375,
"calib/ece": 0.47783464566929135,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9960629921259843,
"calib/gap": 0.0007258064516131313,
"calib/mean_conf": 0.9896456692913386,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.989274193548387,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.47783464566929135,
"calib/std_conf": 0.005635974940365422,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.92001953125,
"calib/step_q_c_n": 1024.0,
"calib/step_q_gap": 0.00918876201923069,
"calib/step_q_w": 0.9108307692307693,
"calib/step_q_w_n": 975.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1852.0,
"completions/max_terminated_length": 1852.0,
"completions/mean_length": 764.6796875,
"completions/mean_terminated_length": 767.678466796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.0352,
"grad_norm": 0.061604708433151245,
"kl": 0.03219795227050781,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0305,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.01927250623703003,
"mask/share_reasoning": 0.8669767379760742,
"mask/share_step_conf": 0.10984449833631516,
"num_tokens": 10166328.0,
"reward": 0.6816341876983643,
"reward_std": 0.23087355494499207,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5180652141571045,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": 0.5452032089233398,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.5265980362892151,
"adv/mean_abs_reasoning": 0.520954966545105,
"adv/mean_abs_step_conf": 0.7565268874168396,
"adv/ratio_final_to_reasoning": 1.0108321642109184,
"adv/ratio_step_to_reasoning": 1.4521924849551051,
"adv/std_final_conf": 0.7767467498779297,
"adv/std_reasoning": 0.7753825783729553,
"adv/std_step_conf": 0.9345172643661499,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5070441079657669,
"calib/avg_num_step_conf": 8.34765625,
"calib/ece": 0.376600790513834,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9920948616600791,
"calib/gap": 0.0017722185648452182,
"calib/mean_conf": 0.9892490118577075,
"calib/mu_c": 0.9899354838709677,
"calib/mu_w": 0.9881632653061225,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.376600790513834,
"calib/std_conf": 0.007991426298258135,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9212392156862744,
"calib/step_q_c_n": 1275.0,
"calib/step_q_gap": 0.0017728583776897011,
"calib/step_q_w": 0.9194663573085847,
"calib/step_q_w_n": 862.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2293.0,
"completions/max_terminated_length": 2293.0,
"completions/mean_length": 741.03125,
"completions/mean_terminated_length": 746.8661499023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 445.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.03941137716174126,
"kl": 0.0266265869140625,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0149,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019667698070406914,
"mask/share_reasoning": 0.8525025844573975,
"mask/share_step_conf": 0.12001723051071167,
"num_tokens": 10461144.0,
"reward": 0.775858998298645,
"reward_std": 0.28364232182502747,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6142761707305908,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": 0.6194730997085571,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.5497709512710571,
"adv/mean_abs_reasoning": 0.4924013018608093,
"adv/mean_abs_step_conf": 0.7616751194000244,
"adv/ratio_final_to_reasoning": 1.1165099466501105,
"adv/ratio_step_to_reasoning": 1.546858459800199,
"adv/std_final_conf": 0.7888799905776978,
"adv/std_reasoning": 0.7575621008872986,
"adv/std_step_conf": 0.9331747889518738,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5073451910408432,
"calib/avg_num_step_conf": 7.83203125,
"calib/ece": 0.4517813765182185,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9595141700404858,
"calib/gap": 0.00166534914360994,
"calib/mean_conf": 0.9861943319838056,
"calib/mu_c": 0.9869696969696969,
"calib/mu_w": 0.985304347826087,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4517813765182185,
"calib/std_conf": 0.01788658286023639,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9200578592092575,
"calib/step_q_c_n": 1037.0,
"calib/step_q_gap": 0.001845049291902101,
"calib/step_q_w": 0.9182128099173554,
"calib/step_q_w_n": 968.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2882.0,
"completions/max_terminated_length": 2882.0,
"completions/mean_length": 801.28515625,
"completions/mean_terminated_length": 823.8112182617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 419.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.04921026900410652,
"kl": 0.024835586547851562,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0897,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01787012442946434,
"mask/share_reasoning": 0.8555195927619934,
"mask/share_step_conf": 0.09926652163267136,
"num_tokens": 10775529.0,
"reward": 0.6645053625106812,
"reward_std": 0.22534745931625366,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5283355116844177,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.5045813322067261,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.3687620759010315,
"adv/mean_abs_reasoning": 0.3481977880001068,
"adv/mean_abs_step_conf": 0.796619176864624,
"adv/ratio_final_to_reasoning": 1.059059214646471,
"adv/ratio_step_to_reasoning": 2.287835260068854,
"adv/std_final_conf": 0.6157107949256897,
"adv/std_reasoning": 0.5961853265762329,
"adv/std_step_conf": 0.930061399936676,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5065135895032803,
"calib/avg_num_step_conf": 8.3046875,
"calib/ece": 0.2101606425702811,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9919678714859438,
"calib/gap": 0.0011724461105901973,
"calib/mean_conf": 0.989277108433735,
"calib/mu_c": 0.989536082474227,
"calib/mu_w": 0.9883636363636368,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2101606425702811,
"calib/std_conf": 0.008033534013575733,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9219170673076924,
"calib/step_q_c_n": 1664.0,
"calib/step_q_gap": 0.006700617091242256,
"calib/step_q_w": 0.9152164502164502,
"calib/step_q_w_n": 462.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2545.0,
"completions/max_terminated_length": 2545.0,
"completions/mean_length": 774.19140625,
"completions/mean_terminated_length": 780.2874145507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 358.0,
"epoch": 0.0384,
"grad_norm": 0.038044821470975876,
"kl": 0.027385711669921875,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0374,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019639793783426285,
"mask/share_reasoning": 0.8501439690589905,
"mask/share_step_conf": 0.12240374088287354,
"num_tokens": 11076434.0,
"reward": 0.9108313918113708,
"reward_std": 0.1632777750492096,
"rewards/accuracy_reward_step": 0.7578125,
"rewards/final_brier_reward_step": 0.7626378536224365,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.7129310965538025,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.41283056139945984,
"adv/mean_abs_reasoning": 0.40489670634269714,
"adv/mean_abs_step_conf": 0.7636315822601318,
"adv/ratio_final_to_reasoning": 1.0195947631395341,
"adv/ratio_step_to_reasoning": 1.8859910942664178,
"adv/std_final_conf": 0.6809049248695374,
"adv/std_reasoning": 0.6816288828849792,
"adv/std_step_conf": 0.9321484565734863,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5079365079365079,
"calib/avg_num_step_conf": 8.23046875,
"calib/ece": 0.48968000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.992,
"calib/gap": 0.008571428571428563,
"calib/mean_conf": 0.9856800000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9814285714285713,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.48968000000000006,
"calib/std_conf": 0.06272270402334389,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9196353166986564,
"calib/step_q_c_n": 1042.0,
"calib/step_q_gap": 0.012273814351238532,
"calib/step_q_w": 0.9073615023474179,
"calib/step_q_w_n": 1065.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2511.0,
"completions/max_terminated_length": 2511.0,
"completions/mean_length": 763.76171875,
"completions/mean_terminated_length": 778.9761352539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.022712556645274162,
"kl": 0.025478363037109375,
"learning_rate": 4.527777777777778e-06,
"loss": -0.1116,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.019015856087207794,
"mask/share_reasoning": 0.8509063720703125,
"mask/share_step_conf": 0.11054646223783493,
"num_tokens": 11379053.0,
"reward": 0.6553043723106384,
"reward_std": 0.20234841108322144,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.49861401319503784,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": 0.5198071599006653,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.5180400609970093,
"adv/mean_abs_reasoning": 0.513764500617981,
"adv/mean_abs_step_conf": 0.7800165414810181,
"adv/ratio_final_to_reasoning": 1.0083220237557975,
"adv/ratio_step_to_reasoning": 1.5182375203868235,
"adv/std_final_conf": 0.7381928563117981,
"adv/std_reasoning": 0.7394514679908752,
"adv/std_step_conf": 0.9326812624931335,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.505,
"calib/avg_num_step_conf": 8.0859375,
"calib/ece": 0.3996721311475411,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.000400000000000178,
"calib/mean_conf": 0.9898360655737706,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9895999999999998,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3996721311475411,
"calib/std_conf": 0.0025554847980524434,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9227439532944121,
"calib/step_q_c_n": 1199.0,
"calib/step_q_gap": 0.013857615751358132,
"calib/step_q_w": 0.9088863375430539,
"calib/step_q_w_n": 871.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2602.0,
"completions/max_terminated_length": 2602.0,
"completions/mean_length": 754.5,
"completions/mean_terminated_length": 775.7108154296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.022115031257271767,
"kl": 0.029575347900390625,
"learning_rate": 4.5e-06,
"loss": -0.0876,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01870632730424404,
"mask/share_reasoning": 0.8444530963897705,
"mask/share_step_conf": 0.10949686914682388,
"num_tokens": 11679093.0,
"reward": 0.710079550743103,
"reward_std": 0.23920077085494995,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5704425573348999,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.5473726987838745,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.4728773832321167,
"adv/mean_abs_reasoning": 0.47525712847709656,
"adv/mean_abs_step_conf": 0.7562281489372253,
"adv/ratio_final_to_reasoning": 0.9949927205666426,
"adv/ratio_step_to_reasoning": 1.5911979087204144,
"adv/std_final_conf": 0.7385199069976807,
"adv/std_reasoning": 0.7392476797103882,
"adv/std_step_conf": 0.9321252107620239,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.504424778761062,
"calib/avg_num_step_conf": 8.3671875,
"calib/ece": 0.4398406374501994,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": 0.0007964601769909763,
"calib/mean_conf": 0.9896414342629484,
"calib/mu_c": 0.99,
"calib/mu_w": 0.989203539823009,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4398406374501994,
"calib/std_conf": 0.005669422099903468,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.924746835443038,
"calib/step_q_c_n": 1106.0,
"calib/step_q_gap": 0.0039360246322271975,
"calib/step_q_w": 0.9208108108108108,
"calib/step_q_w_n": 1036.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2555.0,
"completions/max_terminated_length": 2555.0,
"completions/mean_length": 751.0,
"completions/mean_terminated_length": 759.9051513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.0416,
"grad_norm": 0.028168994933366776,
"kl": 0.02768707275390625,
"learning_rate": 4.472222222222223e-06,
"loss": -0.002,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019812334328889847,
"mask/share_reasoning": 0.8540918231010437,
"mask/share_step_conf": 0.11437710374593735,
"num_tokens": 11977437.0,
"reward": 0.7053453922271729,
"reward_std": 0.22924397885799408,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5484570264816284,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.5583274960517883,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.4358151853084564,
"adv/mean_abs_reasoning": 0.402193546295166,
"adv/mean_abs_step_conf": 0.7792983651161194,
"adv/ratio_final_to_reasoning": 1.083595670102115,
"adv/ratio_step_to_reasoning": 1.9376202634146689,
"adv/std_final_conf": 0.6807341575622559,
"adv/std_reasoning": 0.6614307761192322,
"adv/std_step_conf": 0.9320389032363892,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49264705882352944,
"calib/avg_num_step_conf": 7.3125,
"calib/ece": 0.4477689243027889,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": -0.0007352941176469452,
"calib/mean_conf": 0.989601593625498,
"calib/mu_c": 0.989264705882353,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4477689243027889,
"calib/std_conf": 0.005701806298877533,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9225623130608175,
"calib/step_q_c_n": 1003.0,
"calib/step_q_gap": -0.0017760068471226464,
"calib/step_q_w": 0.9243383199079401,
"calib/step_q_w_n": 869.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2602.0,
"completions/max_terminated_length": 2602.0,
"completions/mean_length": 793.5703125,
"completions/mean_terminated_length": 796.682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.02842039056122303,
"kl": 0.029073715209960938,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0205,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.018899505957961082,
"mask/share_reasoning": 0.874600350856781,
"mask/share_step_conf": 0.10259392857551575,
"num_tokens": 12287351.0,
"reward": 0.6714740991592407,
"reward_std": 0.22889409959316254,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5361906290054321,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": 0.5051949620246887,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.3478602468967438,
"adv/mean_abs_reasoning": 0.34448909759521484,
"adv/mean_abs_step_conf": 0.7755715847015381,
"adv/ratio_final_to_reasoning": 1.0097859390182795,
"adv/ratio_step_to_reasoning": 2.2513675762617553,
"adv/std_final_conf": 0.619642972946167,
"adv/std_reasoning": 0.6185486912727356,
"adv/std_step_conf": 0.932489812374115,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5222222222222221,
"calib/avg_num_step_conf": 8.3125,
"calib/ece": 0.16596078431372552,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.996078431372549,
"calib/gap": 0.0028888888888887188,
"calib/mean_conf": 0.9894901960784314,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9871111111111113,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16596078431372552,
"calib/std_conf": 0.006146488074325677,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.925497175141243,
"calib/step_q_c_n": 1770.0,
"calib/step_q_gap": 0.003206672347946804,
"calib/step_q_w": 0.9222905027932962,
"calib/step_q_w_n": 358.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1446.0,
"completions/max_terminated_length": 1446.0,
"completions/mean_length": 701.359375,
"completions/mean_terminated_length": 704.10986328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 365.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.018630625680088997,
"kl": 0.037181854248046875,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0122,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.021096795797348022,
"mask/share_reasoning": 0.848509669303894,
"mask/share_step_conf": 0.12648731470108032,
"num_tokens": 12574147.0,
"reward": 0.9390561580657959,
"reward_std": 0.19271451234817505,
"rewards/accuracy_reward_step": 0.8203125,
"rewards/final_brier_reward_step": 0.82079017162323,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": 0.6948220729827881,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.3249935507774353,
"adv/mean_abs_reasoning": 0.3224637508392334,
"adv/mean_abs_step_conf": 0.7264145612716675,
"adv/ratio_final_to_reasoning": 1.0078452227005916,
"adv/ratio_step_to_reasoning": 2.2527014567718857,
"adv/std_final_conf": 0.6402071118354797,
"adv/std_reasoning": 0.6401504278182983,
"adv/std_step_conf": 0.93314129114151,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.91015625,
"calib/ece": 0.3900000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3900000000000001,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.922301854974705,
"calib/step_q_c_n": 1186.0,
"calib/step_q_gap": -0.003657620591445254,
"calib/step_q_w": 0.9259594755661502,
"calib/step_q_w_n": 839.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1353.0,
"completions/max_terminated_length": 1353.0,
"completions/mean_length": 665.9765625,
"completions/mean_terminated_length": 676.5476684570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 382.0,
"epoch": 0.0448,
"grad_norm": 0.023608466610312462,
"kl": 0.048583984375,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0291,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.021571829915046692,
"mask/share_reasoning": 0.8393821120262146,
"mask/share_step_conf": 0.1234210804104805,
"num_tokens": 12849005.0,
"reward": 0.7322165966033936,
"reward_std": 0.18012070655822754,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.5936523079872131,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": 0.5582807660102844,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.4121657609939575,
"adv/mean_abs_reasoning": 0.4092313051223755,
"adv/mean_abs_step_conf": 0.7901467084884644,
"adv/ratio_final_to_reasoning": 1.007170653454052,
"adv/ratio_step_to_reasoning": 1.9308070975952851,
"adv/std_final_conf": 0.661336362361908,
"adv/std_reasoning": 0.6613420248031616,
"adv/std_step_conf": 0.9343063235282898,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.421875,
"calib/ece": 0.3062055335968379,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3062055335968379,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9209961977186313,
"calib/step_q_c_n": 1315.0,
"calib/step_q_gap": -0.0018585031360693893,
"calib/step_q_w": 0.9228547008547007,
"calib/step_q_w_n": 585.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2491.0,
"completions/max_terminated_length": 2491.0,
"completions/mean_length": 728.42578125,
"completions/mean_terminated_length": 734.1614379882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 412.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.012838178314268589,
"kl": 0.032314300537109375,
"learning_rate": 4.361111111111112e-06,
"loss": -0.003,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02045440673828125,
"mask/share_reasoning": 0.8614332675933838,
"mask/share_step_conf": 0.11029988527297974,
"num_tokens": 13140706.0,
"reward": 0.8210978507995605,
"reward_std": 0.21742019057273865,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.6819324493408203,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": 0.6274506449699402,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.42065298557281494,
"adv/mean_abs_reasoning": 0.40963807702064514,
"adv/mean_abs_step_conf": 0.7803975343704224,
"adv/ratio_final_to_reasoning": 1.0268893669072043,
"adv/ratio_step_to_reasoning": 1.9050903178883234,
"adv/std_final_conf": 0.68174147605896,
"adv/std_reasoning": 0.6816533803939819,
"adv/std_step_conf": 0.932420551776886,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.41015625,
"calib/ece": 0.4701587301587302,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4701587301587302,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.923446735395189,
"calib/step_q_c_n": 970.0,
"calib/step_q_gap": -0.0008898341840990476,
"calib/step_q_w": 0.924336569579288,
"calib/step_q_w_n": 927.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2264.0,
"completions/max_terminated_length": 2264.0,
"completions/mean_length": 755.4453125,
"completions/mean_terminated_length": 761.3936767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 377.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.019129678606987,
"kl": 0.03778839111328125,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0373,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.019462086260318756,
"mask/share_reasoning": 0.8706867098808289,
"mask/share_step_conf": 0.10203869640827179,
"num_tokens": 13440420.0,
"reward": 0.6606093049049377,
"reward_std": 0.21581587195396423,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5209956765174866,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.5010042190551758,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.5471866130828857,
"adv/mean_abs_reasoning": 0.5119894742965698,
"adv/mean_abs_step_conf": 0.7427734136581421,
"adv/ratio_final_to_reasoning": 1.0687458249696906,
"adv/ratio_step_to_reasoning": 1.4507591482786826,
"adv/std_final_conf": 0.8045254349708557,
"adv/std_reasoning": 0.792806088924408,
"adv/std_step_conf": 0.9351861476898193,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.496551724137931,
"calib/avg_num_step_conf": 8.1328125,
"calib/ece": 0.4098400000000002,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002758620689655711,
"calib/mean_conf": 0.9898400000000002,
"calib/mu_c": 0.9897241379310344,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4098400000000002,
"calib/std_conf": 0.002524757414089521,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.926106346483705,
"calib/step_q_c_n": 1166.0,
"calib/step_q_gap": 0.0009753421168928744,
"calib/step_q_w": 0.9251310043668122,
"calib/step_q_w_n": 916.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2727.0,
"completions/max_terminated_length": 2727.0,
"completions/mean_length": 745.421875,
"completions/mean_terminated_length": 754.2609252929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 318.0,
"epoch": 0.048,
"grad_norm": 0.02074560336768627,
"kl": 0.037052154541015625,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0306,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.01985202543437481,
"mask/share_reasoning": 0.8520389795303345,
"mask/share_step_conf": 0.11639025807380676,
"num_tokens": 13736296.0,
"reward": 0.7014477252960205,
"reward_std": 0.27716386318206787,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5705187320709229,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.5253453254699707,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.41354337334632874,
"adv/mean_abs_reasoning": 0.4040815830230713,
"adv/mean_abs_step_conf": 0.7503899931907654,
"adv/ratio_final_to_reasoning": 1.0234155445850082,
"adv/ratio_step_to_reasoning": 1.8570259688076936,
"adv/std_final_conf": 0.6817355751991272,
"adv/std_reasoning": 0.6816933155059814,
"adv/std_step_conf": 0.9333223104476929,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.52734375,
"calib/ece": 0.46346938775510205,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46346938775510205,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9234462151394424,
"calib/step_q_c_n": 1004.0,
"calib/step_q_gap": 0.007498219473136891,
"calib/step_q_w": 0.9159479956663055,
"calib/step_q_w_n": 923.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2834.0,
"completions/max_terminated_length": 2834.0,
"completions/mean_length": 798.78125,
"completions/mean_terminated_length": 805.0708618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.0159748587757349,
"kl": 0.03411674499511719,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0446,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019596949219703674,
"mask/share_reasoning": 0.8614305257797241,
"mask/share_step_conf": 0.11116001754999161,
"num_tokens": 14045552.0,
"reward": 0.6591576337814331,
"reward_std": 0.22060437500476837,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5128730535507202,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.5132546424865723,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.44636037945747375,
"adv/mean_abs_reasoning": 0.43221962451934814,
"adv/mean_abs_step_conf": 0.7530901432037354,
"adv/ratio_final_to_reasoning": 1.0327165962301015,
"adv/ratio_step_to_reasoning": 1.7423784124592048,
"adv/std_final_conf": 0.700847864151001,
"adv/std_reasoning": 0.7015045881271362,
"adv/std_step_conf": 0.9329828023910522,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.26171875,
"calib/ece": 0.34510204081632656,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34510204081632656,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9173498233215548,
"calib/step_q_c_n": 1132.0,
"calib/step_q_gap": 0.0004997545457638974,
"calib/step_q_w": 0.9168500687757909,
"calib/step_q_w_n": 727.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2576.0,
"completions/max_terminated_length": 2576.0,
"completions/mean_length": 771.3671875,
"completions/mean_terminated_length": 783.6111450195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.014016284607350826,
"kl": 0.03520965576171875,
"learning_rate": 4.25e-06,
"loss": -0.0716,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.019007964059710503,
"mask/share_reasoning": 0.8643789291381836,
"mask/share_step_conf": 0.10098807513713837,
"num_tokens": 14348998.0,
"reward": 0.7666846513748169,
"reward_std": 0.21894043684005737,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6238887310028076,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.5946369171142578,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.5394688844680786,
"adv/mean_abs_reasoning": 0.5332775115966797,
"adv/mean_abs_step_conf": 0.7809452414512634,
"adv/ratio_final_to_reasoning": 1.011610039307417,
"adv/ratio_step_to_reasoning": 1.4644256029342861,
"adv/std_final_conf": 0.7751846313476562,
"adv/std_reasoning": 0.7754152417182922,
"adv/std_step_conf": 0.9337522983551025,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5043478260869565,
"calib/avg_num_step_conf": 7.7109375,
"calib/ece": 0.45168674698795186,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0003478260869566041,
"calib/mean_conf": 0.989839357429719,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9896521739130434,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.45168674698795186,
"calib/std_conf": 0.0025298017265901426,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.921573986804901,
"calib/step_q_c_n": 1061.0,
"calib/step_q_gap": 0.0020449616132252046,
"calib/step_q_w": 0.9195290251916758,
"calib/step_q_w_n": 913.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2452.0,
"completions/max_terminated_length": 2452.0,
"completions/mean_length": 749.64453125,
"completions/mean_terminated_length": 755.5472412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.0512,
"grad_norm": 0.01471987459808588,
"kl": 0.03924560546875,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0151,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.020167170092463493,
"mask/share_reasoning": 0.8575121164321899,
"mask/share_step_conf": 0.1145082637667656,
"num_tokens": 14644595.0,
"reward": 0.6888903379440308,
"reward_std": 0.2654000520706177,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5326277017593384,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.5459342002868652,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.5216391086578369,
"adv/mean_abs_reasoning": 0.5024067163467407,
"adv/mean_abs_step_conf": 0.749990701675415,
"adv/ratio_final_to_reasoning": 1.0382805238969433,
"adv/ratio_step_to_reasoning": 1.492795930613718,
"adv/std_final_conf": 0.7634592056274414,
"adv/std_reasoning": 0.7575902938842773,
"adv/std_step_conf": 0.9341700673103333,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.49693251533742333,
"calib/avg_num_step_conf": 7.57421875,
"calib/ece": 0.32999999999999996,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00012269938650288914,
"calib/mean_conf": 0.9899190283400809,
"calib/mu_c": 0.9898773006134969,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32999999999999996,
"calib/std_conf": 0.0012699908616484326,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9207016191210486,
"calib/step_q_c_n": 1297.0,
"calib/step_q_gap": 0.003723425974631156,
"calib/step_q_w": 0.9169781931464175,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2750.0,
"completions/max_terminated_length": 2750.0,
"completions/mean_length": 694.75390625,
"completions/mean_terminated_length": 708.5936279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 414.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.01923024281859398,
"kl": 0.040454864501953125,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0531,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.020560309290885925,
"mask/share_reasoning": 0.8439966440200806,
"mask/share_step_conf": 0.11591173708438873,
"num_tokens": 14926988.0,
"reward": 0.769041895866394,
"reward_std": 0.25200900435447693,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6431816220283508,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.5745896100997925,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.4636262357234955,
"adv/mean_abs_reasoning": 0.46020743250846863,
"adv/mean_abs_step_conf": 0.761410117149353,
"adv/ratio_final_to_reasoning": 1.007428830943455,
"adv/ratio_step_to_reasoning": 1.6544933075050712,
"adv/std_final_conf": 0.7198258638381958,
"adv/std_reasoning": 0.7206044793128967,
"adv/std_step_conf": 0.932525634765625,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.796875,
"calib/ece": 0.32467741935483885,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32467741935483885,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.92359756097561,
"calib/step_q_c_n": 1312.0,
"calib/step_q_gap": 0.0011121808586508353,
"calib/step_q_w": 0.9224853801169591,
"calib/step_q_w_n": 684.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1916.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 730.42578125,
"completions/mean_terminated_length": 733.2902221679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 328.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.013666275888681412,
"kl": 0.03583526611328125,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0225,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.020532866939902306,
"mask/share_reasoning": 0.8563636541366577,
"mask/share_step_conf": 0.11919719725847244,
"num_tokens": 15219337.0,
"reward": 0.7715030908584595,
"reward_std": 0.22498729825019836,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.6508409976959229,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.570290207862854,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.3940299153327942,
"adv/mean_abs_reasoning": 0.3988789916038513,
"adv/mean_abs_step_conf": 0.7379977107048035,
"adv/ratio_final_to_reasoning": 0.9878432397465721,
"adv/ratio_step_to_reasoning": 1.850179443488339,
"adv/std_final_conf": 0.682790994644165,
"adv/std_reasoning": 0.6815266609191895,
"adv/std_step_conf": 0.9336446523666382,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5053763440860215,
"calib/avg_num_step_conf": 7.89453125,
"calib/ece": 0.3574308300395257,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00043010752688188436,
"calib/mean_conf": 0.9898418972332016,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9895698924731181,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3574308300395257,
"calib/std_conf": 0.0025098036152391397,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9203018575851395,
"calib/step_q_c_n": 1292.0,
"calib/step_q_gap": 0.007160568147553725,
"calib/step_q_w": 0.9131412894375858,
"calib/step_q_w_n": 729.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2489.0,
"completions/max_terminated_length": 2489.0,
"completions/mean_length": 708.29296875,
"completions/mean_terminated_length": 711.0706176757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 294.0,
"epoch": 0.0544,
"grad_norm": 0.016280025243759155,
"kl": 0.03824615478515625,
"learning_rate": 4.138888888888889e-06,
"loss": 0.0003,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.021065454930067062,
"mask/share_reasoning": 0.8582796454429626,
"mask/share_step_conf": 0.11674864590167999,
"num_tokens": 15509956.0,
"reward": 0.7576898336410522,
"reward_std": 0.21025392413139343,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6285640597343445,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": 0.5649406909942627,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.4902188777923584,
"adv/mean_abs_reasoning": 0.4728948175907135,
"adv/mean_abs_step_conf": 0.7851869463920593,
"adv/ratio_final_to_reasoning": 1.0366340665137903,
"adv/ratio_step_to_reasoning": 1.6603839103003917,
"adv/std_final_conf": 0.7273574471473694,
"adv/std_reasoning": 0.7206883430480957,
"adv/std_step_conf": 0.9345096349716187,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.49411764705882355,
"calib/avg_num_step_conf": 7.3046875,
"calib/ece": 0.3124701195219124,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0003529411764705559,
"calib/mean_conf": 0.9897609561752988,
"calib/mu_c": 0.9896470588235294,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3124701195219124,
"calib/std_conf": 0.0028126474254538338,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9195335029686176,
"calib/step_q_c_n": 1179.0,
"calib/step_q_gap": -0.004185744498820809,
"calib/step_q_w": 0.9237192474674384,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1641.0,
"completions/max_terminated_length": 1641.0,
"completions/mean_length": 724.32421875,
"completions/mean_terminated_length": 732.9130859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.01628994569182396,
"kl": 0.036128997802734375,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0324,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.02012309432029724,
"mask/share_reasoning": 0.8632853031158447,
"mask/share_step_conf": 0.10487289726734161,
"num_tokens": 15803335.0,
"reward": 0.8191436529159546,
"reward_std": 0.2507874369621277,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6702799797058105,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6391010284423828,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.36346209049224854,
"adv/mean_abs_reasoning": 0.3581864535808563,
"adv/mean_abs_step_conf": 0.7379522323608398,
"adv/ratio_final_to_reasoning": 1.0147287449278182,
"adv/ratio_step_to_reasoning": 2.0602460673300027,
"adv/std_final_conf": 0.641384482383728,
"adv/std_reasoning": 0.6402559280395508,
"adv/std_step_conf": 0.9335207939147949,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.1640625,
"calib/ece": 0.2813385826771654,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2813385826771654,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9206836248012719,
"calib/step_q_c_n": 1258.0,
"calib/step_q_gap": 0.0044857081346051375,
"calib/step_q_w": 0.9161979166666667,
"calib/step_q_w_n": 576.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1557.0,
"completions/max_terminated_length": 1557.0,
"completions/mean_length": 723.734375,
"completions/mean_terminated_length": 726.5725708007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.014305333606898785,
"kl": 0.03975677490234375,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0329,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.020240100100636482,
"mask/share_reasoning": 0.8698899745941162,
"mask/share_step_conf": 0.10596367716789246,
"num_tokens": 16094435.0,
"reward": 0.8506057262420654,
"reward_std": 0.1964026689529419,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7088069915771484,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": 0.6533418893814087,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.3542019724845886,
"adv/mean_abs_reasoning": 0.3400605022907257,
"adv/mean_abs_step_conf": 0.7365595102310181,
"adv/ratio_final_to_reasoning": 1.0415851594013499,
"adv/ratio_step_to_reasoning": 2.165966071535459,
"adv/std_final_conf": 0.6514297723770142,
"adv/std_reasoning": 0.6401675939559937,
"adv/std_step_conf": 0.9333468079566956,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4973544973544973,
"calib/avg_num_step_conf": 8.0,
"calib/ece": 0.24578740157480317,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000158730158730358,
"calib/mean_conf": 0.9898818897637796,
"calib/mu_c": 0.9898412698412697,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.24578740157480317,
"calib/std_conf": 0.0018786583134551434,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9230980392156862,
"calib/step_q_c_n": 1530.0,
"calib/step_q_gap": 0.011823907941554967,
"calib/step_q_w": 0.9112741312741313,
"calib/step_q_w_n": 518.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1606.0,
"completions/max_terminated_length": 1606.0,
"completions/mean_length": 687.51953125,
"completions/mean_terminated_length": 690.2156982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.0576,
"grad_norm": 0.019337039440870285,
"kl": 0.040557861328125,
"learning_rate": 4.055555555555556e-06,
"loss": -0.0213,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.021507475525140762,
"mask/share_reasoning": 0.851201593875885,
"mask/share_step_conf": 0.12338472157716751,
"num_tokens": 16376672.0,
"reward": 0.8689508438110352,
"reward_std": 0.18806323409080505,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.7393484115600586,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": 0.6532406806945801,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.38243305683135986,
"adv/mean_abs_reasoning": 0.356126606464386,
"adv/mean_abs_step_conf": 0.7784011363983154,
"adv/ratio_final_to_reasoning": 1.0738682532825714,
"adv/ratio_step_to_reasoning": 2.1857427169687154,
"adv/std_final_conf": 0.6601985096931458,
"adv/std_reasoning": 0.6402488350868225,
"adv/std_step_conf": 0.9331528544425964,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5089285714285714,
"calib/avg_num_step_conf": 7.765625,
"calib/ece": 0.4297619047619048,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9920634920634921,
"calib/gap": 0.010535714285714093,
"calib/mean_conf": 0.9853174603174604,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9794642857142859,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4297619047619048,
"calib/std_conf": 0.06332940473951357,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9225273722627737,
"calib/step_q_c_n": 1096.0,
"calib/step_q_gap": 0.011989255670845411,
"calib/step_q_w": 0.9105381165919283,
"calib/step_q_w_n": 892.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2534.0,
"completions/max_terminated_length": 2534.0,
"completions/mean_length": 702.04296875,
"completions/mean_terminated_length": 704.796142578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.02334001660346985,
"kl": 0.041500091552734375,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0285,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.021185822784900665,
"mask/share_reasoning": 0.8552565574645996,
"mask/share_step_conf": 0.1196514219045639,
"num_tokens": 16664219.0,
"reward": 0.7132257223129272,
"reward_std": 0.1884762942790985,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5606836080551147,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": 0.5595178604125977,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.43028029799461365,
"adv/mean_abs_reasoning": 0.42309921979904175,
"adv/mean_abs_step_conf": 0.7558045387268066,
"adv/ratio_final_to_reasoning": 1.0169725630762985,
"adv/ratio_step_to_reasoning": 1.7863529483363003,
"adv/std_final_conf": 0.7008323669433594,
"adv/std_reasoning": 0.7013556957244873,
"adv/std_step_conf": 0.9343731999397278,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.3984375,
"calib/ece": 0.41570281124498,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.220446049250313e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41570281124498,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9252973977695167,
"calib/step_q_c_n": 1076.0,
"calib/step_q_gap": 0.007473436889321117,
"calib/step_q_w": 0.9178239608801956,
"calib/step_q_w_n": 818.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2009.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 735.5078125,
"completions/mean_terminated_length": 741.2991943359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 399.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.015703028067946434,
"kl": 0.039569854736328125,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0043,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02014715038239956,
"mask/share_reasoning": 0.8610376119613647,
"mask/share_step_conf": 0.11100269854068756,
"num_tokens": 16959349.0,
"reward": 0.7191253304481506,
"reward_std": 0.2070513367652893,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5667777061462402,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.565222978591919,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.4444199204444885,
"adv/mean_abs_reasoning": 0.43316081166267395,
"adv/mean_abs_step_conf": 0.7824656963348389,
"adv/ratio_final_to_reasoning": 1.0259929072036706,
"adv/ratio_step_to_reasoning": 1.8064092486376349,
"adv/std_final_conf": 0.7026104927062988,
"adv/std_reasoning": 0.7014220952987671,
"adv/std_step_conf": 0.9334197640419006,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.55078125,
"calib/ece": 0.3034920634920636,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 3.3306690738754696e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999995,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3034920634920636,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9256273199703045,
"calib/step_q_c_n": 1347.0,
"calib/step_q_gap": 0.005951552052215758,
"calib/step_q_w": 0.9196757679180887,
"calib/step_q_w_n": 586.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 740.6171875,
"completions/mean_terminated_length": 743.5216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 387.0,
"epoch": 0.0608,
"grad_norm": 0.015545027330517769,
"kl": 0.048763275146484375,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0178,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02056235447525978,
"mask/share_reasoning": 0.8628644943237305,
"mask/share_step_conf": 0.11266690492630005,
"num_tokens": 17255739.0,
"reward": 0.8158880472183228,
"reward_std": 0.23786352574825287,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.6779488325119019,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6225773692131042,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.5775068998336792,
"adv/mean_abs_reasoning": 0.571548581123352,
"adv/mean_abs_step_conf": 0.7824968099594116,
"adv/ratio_final_to_reasoning": 1.0104248683438535,
"adv/ratio_step_to_reasoning": 1.3690818870050394,
"adv/std_final_conf": 0.7946643233299255,
"adv/std_reasoning": 0.7929192781448364,
"adv/std_step_conf": 0.935107946395874,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.4966442953020134,
"calib/avg_num_step_conf": 7.68359375,
"calib/ece": 0.37930327868852465,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -6.711409395943058e-05,
"calib/mean_conf": 0.9899590163934426,
"calib/mu_c": 0.9899328859060402,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37930327868852465,
"calib/std_conf": 0.0006388711995131108,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9162784810126582,
"calib/step_q_c_n": 1185.0,
"calib/step_q_gap": -0.0010488847162419823,
"calib/step_q_w": 0.9173273657289002,
"calib/step_q_w_n": 782.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2762.0,
"completions/max_terminated_length": 2762.0,
"completions/mean_length": 777.26171875,
"completions/mean_terminated_length": 792.7450561523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 438.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.015236100181937218,
"kl": 0.03882598876953125,
"learning_rate": 3.944444444444445e-06,
"loss": -0.0967,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.018720220774412155,
"mask/share_reasoning": 0.8575639724731445,
"mask/share_step_conf": 0.10418450832366943,
"num_tokens": 17561038.0,
"reward": 0.7194130420684814,
"reward_std": 0.2849310040473938,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.5893566012382507,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.542438268661499,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.5258022546768188,
"adv/mean_abs_reasoning": 0.5158117413520813,
"adv/mean_abs_step_conf": 0.803584098815918,
"adv/ratio_final_to_reasoning": 1.0193685263901704,
"adv/ratio_step_to_reasoning": 1.5579019134180776,
"adv/std_final_conf": 0.7406392693519592,
"adv/std_reasoning": 0.7395347356796265,
"adv/std_step_conf": 0.9328177571296692,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.505,
"calib/avg_num_step_conf": 7.28125,
"calib/ece": 0.3914457831325302,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.000400000000000178,
"calib/mean_conf": 0.989839357429719,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9895999999999998,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3914457831325302,
"calib/std_conf": 0.002529801726590142,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9236056838365897,
"calib/step_q_c_n": 1126.0,
"calib/step_q_gap": 0.0004485022647739978,
"calib/step_q_w": 0.9231571815718157,
"calib/step_q_w_n": 738.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1636.0,
"completions/max_terminated_length": 1636.0,
"completions/mean_length": 711.48046875,
"completions/mean_terminated_length": 719.9170532226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 341.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.013194671832025051,
"kl": 0.047607421875,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0722,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.020480435341596603,
"mask/share_reasoning": 0.8568286299705505,
"mask/share_step_conf": 0.11097220331430435,
"num_tokens": 17849425.0,
"reward": 0.7413979768753052,
"reward_std": 0.28497523069381714,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.5900495648384094,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.5810275077819824,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.5531863570213318,
"adv/mean_abs_reasoning": 0.5323112607002258,
"adv/mean_abs_step_conf": 0.7446668148040771,
"adv/ratio_final_to_reasoning": 1.0392159585233007,
"adv/ratio_step_to_reasoning": 1.3989311701287503,
"adv/std_final_conf": 0.7932870984077454,
"adv/std_reasoning": 0.7929216027259827,
"adv/std_step_conf": 0.9346678256988525,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.3828125,
"calib/ece": 0.4249593495934959,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4249593495934959,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9261883408071749,
"calib/step_q_c_n": 1115.0,
"calib/step_q_gap": 0.00030085293132631197,
"calib/step_q_w": 0.9258874878758486,
"calib/step_q_w_n": 1031.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2978.0,
"completions/max_terminated_length": 2978.0,
"completions/mean_length": 755.8125,
"completions/mean_terminated_length": 770.8685302734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 430.0,
"epoch": 0.064,
"grad_norm": 0.012127498164772987,
"kl": 0.04190826416015625,
"learning_rate": 3.88888888888889e-06,
"loss": -0.033,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01983150653541088,
"mask/share_reasoning": 0.8472711443901062,
"mask/share_step_conf": 0.11336609721183777,
"num_tokens": 18151769.0,
"reward": 0.6970077753067017,
"reward_std": 0.28720253705978394,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5512320399284363,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5420023202896118,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.4006597697734833,
"adv/mean_abs_reasoning": 0.3929939270019531,
"adv/mean_abs_step_conf": 0.7636055946350098,
"adv/ratio_final_to_reasoning": 1.0195062626794538,
"adv/ratio_step_to_reasoning": 1.9430468059910115,
"adv/std_final_conf": 0.6817258596420288,
"adv/std_reasoning": 0.6815775632858276,
"adv/std_step_conf": 0.9337095022201538,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.2109375,
"calib/ece": 0.2836507936507937,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2836507936507937,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9289635854341738,
"calib/step_q_c_n": 1428.0,
"calib/step_q_gap": 0.002123822822897692,
"calib/step_q_w": 0.9268397626112761,
"calib/step_q_w_n": 674.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2709.0,
"completions/max_terminated_length": 2709.0,
"completions/mean_length": 701.54296875,
"completions/mean_terminated_length": 701.54296875,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.012162295170128345,
"kl": 0.047260284423828125,
"learning_rate": 3.861111111111112e-06,
"loss": 0.058,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.02129204198718071,
"mask/share_reasoning": 0.8529036045074463,
"mask/share_step_conf": 0.1258043497800827,
"num_tokens": 18435428.0,
"reward": 0.8267836570739746,
"reward_std": 0.21470335125923157,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.697089433670044,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6213215589523315,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.5658851265907288,
"adv/mean_abs_reasoning": 0.5585103034973145,
"adv/mean_abs_step_conf": 0.7698912620544434,
"adv/ratio_final_to_reasoning": 1.0132044530731736,
"adv/ratio_step_to_reasoning": 1.3784728002930124,
"adv/std_final_conf": 0.7942206859588623,
"adv/std_reasoning": 0.7928854823112488,
"adv/std_step_conf": 0.9352415204048157,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.94921875,
"calib/ece": 0.41570281124498,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.220446049250313e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41570281124498,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9266353982300884,
"calib/step_q_c_n": 1130.0,
"calib/step_q_gap": 0.0038729672908619506,
"calib/step_q_w": 0.9227624309392265,
"calib/step_q_w_n": 905.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2037.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 729.51171875,
"completions/mean_terminated_length": 735.2559204101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 430.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.01219266839325428,
"kl": 0.046234130859375,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0092,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.020023802295327187,
"mask/share_reasoning": 0.8546423316001892,
"mask/share_step_conf": 0.11752137541770935,
"num_tokens": 18729263.0,
"reward": 0.7053297758102417,
"reward_std": 0.29709774255752563,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5667777061462402,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.5376317501068115,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.45947033166885376,
"adv/mean_abs_reasoning": 0.4331585168838501,
"adv/mean_abs_step_conf": 0.7686165571212769,
"adv/ratio_final_to_reasoning": 1.0607440781132305,
"adv/ratio_step_to_reasoning": 1.7744463681580538,
"adv/std_final_conf": 0.7218553423881531,
"adv/std_reasoning": 0.7207862734794617,
"adv/std_step_conf": 0.9341201782226562,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5048076923076923,
"calib/avg_num_step_conf": 7.97265625,
"calib/ece": 0.41247967479674796,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0006730769230766098,
"calib/mean_conf": 0.9897154471544716,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9893269230769233,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41247967479674796,
"calib/std_conf": 0.004453956540548361,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9278424015009382,
"calib/step_q_c_n": 1066.0,
"calib/step_q_gap": 0.0019449656035023244,
"calib/step_q_w": 0.9258974358974359,
"calib/step_q_w_n": 975.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3032.0,
"completions/max_terminated_length": 3032.0,
"completions/mean_length": 723.21875,
"completions/mean_terminated_length": 743.5501708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 382.0,
"epoch": 0.0672,
"grad_norm": 0.012249491177499294,
"kl": 0.04229736328125,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.0804,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019764099270105362,
"mask/share_reasoning": 0.8458855152130127,
"mask/share_step_conf": 0.10700662434101105,
"num_tokens": 19023047.0,
"reward": 0.7130417823791504,
"reward_std": 0.2504788637161255,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5632386207580566,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.559719979763031,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.4521995484828949,
"adv/mean_abs_reasoning": 0.45031213760375977,
"adv/mean_abs_step_conf": 0.7582074999809265,
"adv/ratio_final_to_reasoning": 1.0041913391213007,
"adv/ratio_step_to_reasoning": 1.683737649212758,
"adv/std_final_conf": 0.7208352088928223,
"adv/std_reasoning": 0.7206861972808838,
"adv/std_step_conf": 0.9330006241798401,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 7.66015625,
"calib/ece": 0.24099601593625497,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -3.3306690738754696e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9900000000000003,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24099601593625497,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9253862068965517,
"calib/step_q_c_n": 1450.0,
"calib/step_q_gap": 0.0016680072879411423,
"calib/step_q_w": 0.9237181996086106,
"calib/step_q_w_n": 511.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2761.0,
"completions/max_terminated_length": 2761.0,
"completions/mean_length": 705.65625,
"completions/mean_terminated_length": 714.0237426757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.012615746818482876,
"kl": 0.047271728515625,
"learning_rate": 3.777777777777778e-06,
"loss": -0.0807,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.02077304944396019,
"mask/share_reasoning": 0.8502489328384399,
"mask/share_step_conf": 0.11725927889347076,
"num_tokens": 19307471.0,
"reward": 0.8746538758277893,
"reward_std": 0.24922211468219757,
"rewards/accuracy_reward_step": 0.734375,
"rewards/final_brier_reward_step": 0.735292911529541,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": 0.6718271970748901,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.3024356961250305,
"adv/mean_abs_reasoning": 0.3007676899433136,
"adv/mean_abs_step_conf": 0.7550225257873535,
"adv/ratio_final_to_reasoning": 1.0055458290151822,
"adv/ratio_step_to_reasoning": 2.510317933185092,
"adv/std_final_conf": 0.5958569049835205,
"adv/std_reasoning": 0.5959718823432922,
"adv/std_step_conf": 0.9315097332000732,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4967741935483871,
"calib/avg_num_step_conf": 8.37109375,
"calib/ece": 0.3772332015810277,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.00019354838709662037,
"calib/mean_conf": 0.9898814229249012,
"calib/mu_c": 0.9898064516129034,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3772332015810277,
"calib/std_conf": 0.0018823527114293543,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9284633757961784,
"calib/step_q_c_n": 1256.0,
"calib/step_q_gap": 0.0004024964275199938,
"calib/step_q_w": 0.9280608793686584,
"calib/step_q_w_n": 887.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1362.0,
"completions/max_terminated_length": 1362.0,
"completions/mean_length": 656.8984375,
"completions/mean_terminated_length": 664.687744140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 368.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.014276616275310516,
"kl": 0.04969024658203125,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0374,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02203986421227455,
"mask/share_reasoning": 0.8352499008178711,
"mask/share_step_conf": 0.13099150359630585,
"num_tokens": 19580661.0,
"reward": 0.744539737701416,
"reward_std": 0.16033200919628143,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.609114408493042,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": 0.5619961619377136,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.5220428705215454,
"adv/mean_abs_reasoning": 0.5041089653968811,
"adv/mean_abs_step_conf": 0.7199528217315674,
"adv/ratio_final_to_reasoning": 1.0355754536334125,
"adv/ratio_step_to_reasoning": 1.4281690490561978,
"adv/std_final_conf": 0.7923429608345032,
"adv/std_reasoning": 0.792874813079834,
"adv/std_step_conf": 0.9343789219856262,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.6796875,
"calib/ece": 0.4776033057851241,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4776033057851241,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.92610710607621,
"calib/step_q_c_n": 971.0,
"calib/step_q_gap": -1.5995442574912033e-05,
"calib/step_q_w": 0.926123101518785,
"calib/step_q_w_n": 1251.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2566.0,
"completions/max_terminated_length": 2566.0,
"completions/mean_length": 768.4765625,
"completions/mean_terminated_length": 799.7153930664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 368.0,
"epoch": 0.0704,
"grad_norm": 0.01726270467042923,
"kl": 0.042469024658203125,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.1283,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.018511904403567314,
"mask/share_reasoning": 0.836242139339447,
"mask/share_step_conf": 0.1061834916472435,
"num_tokens": 19883743.0,
"reward": 0.6393930912017822,
"reward_std": 0.273938924074173,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.49349915981292725,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.499349445104599,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.39268285036087036,
"adv/mean_abs_reasoning": 0.38221269845962524,
"adv/mean_abs_step_conf": 0.7692726254463196,
"adv/ratio_final_to_reasoning": 1.0273935218359866,
"adv/ratio_step_to_reasoning": 2.0126820185373333,
"adv/std_final_conf": 0.6624247431755066,
"adv/std_reasoning": 0.661316454410553,
"adv/std_step_conf": 0.9332561492919922,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.1953125,
"calib/ece": 0.36944664031620555,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.220446049250313e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36944664031620555,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9247068273092371,
"calib/step_q_c_n": 1245.0,
"calib/step_q_gap": 0.0067466866292840155,
"calib/step_q_w": 0.9179601406799531,
"calib/step_q_w_n": 853.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 749.734375,
"completions/mean_terminated_length": 755.6378173828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.014443333260715008,
"kl": 0.043788909912109375,
"learning_rate": 3.694444444444445e-06,
"loss": -0.0186,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.01966247148811817,
"mask/share_reasoning": 0.8596717715263367,
"mask/share_step_conf": 0.11285325139760971,
"num_tokens": 20180683.0,
"reward": 0.7620497345924377,
"reward_std": 0.21320092678070068,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6206823587417603,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": 0.5831045508384705,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.38533303141593933,
"adv/mean_abs_reasoning": 0.3794650435447693,
"adv/mean_abs_step_conf": 0.7499998807907104,
"adv/ratio_final_to_reasoning": 1.0154638430363816,
"adv/ratio_step_to_reasoning": 1.9764663268705684,
"adv/std_final_conf": 0.6597793698310852,
"adv/std_reasoning": 0.661293625831604,
"adv/std_step_conf": 0.9329918622970581,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.22265625,
"calib/ece": 0.3965040650406504,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3965040650406504,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9219471947194721,
"calib/step_q_c_n": 1212.0,
"calib/step_q_gap": 0.013806097295059994,
"calib/step_q_w": 0.9081410974244121,
"calib/step_q_w_n": 893.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2514.0,
"completions/max_terminated_length": 2514.0,
"completions/mean_length": 699.83984375,
"completions/mean_terminated_length": 716.6360473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 335.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.013353622518479824,
"kl": 0.046169281005859375,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.1861,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.02063082344830036,
"mask/share_reasoning": 0.8354395627975464,
"mask/share_step_conf": 0.12049206346273422,
"num_tokens": 20463930.0,
"reward": 0.7265154123306274,
"reward_std": 0.1846695840358734,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.5780289173126221,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5687519311904907,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.5332236289978027,
"adv/mean_abs_reasoning": 0.5274813175201416,
"adv/mean_abs_step_conf": 0.7604198455810547,
"adv/ratio_final_to_reasoning": 1.0108862840956294,
"adv/ratio_step_to_reasoning": 1.4416052669998467,
"adv/std_final_conf": 0.7735781073570251,
"adv/std_reasoning": 0.7754120230674744,
"adv/std_step_conf": 0.9339145421981812,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.4966442953020134,
"calib/avg_num_step_conf": 8.05078125,
"calib/ece": 0.37930327868852465,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -6.711409395943058e-05,
"calib/mean_conf": 0.9899590163934426,
"calib/mu_c": 0.9899328859060402,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37930327868852465,
"calib/std_conf": 0.0006388711995131111,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9190017825311944,
"calib/step_q_c_n": 1122.0,
"calib/step_q_gap": -0.001743691377218859,
"calib/step_q_w": 0.9207454739084132,
"calib/step_q_w_n": 939.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2638.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 778.88671875,
"completions/mean_terminated_length": 794.4024047851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.0736,
"grad_norm": 0.011452188715338707,
"kl": 0.036396026611328125,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0839,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.018734045326709747,
"mask/share_reasoning": 0.8554292917251587,
"mask/share_step_conf": 0.10630541294813156,
"num_tokens": 20767821.0,
"reward": 0.7228295803070068,
"reward_std": 0.25022977590560913,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.5893566608428955,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.5492713451385498,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.42671480774879456,
"adv/mean_abs_reasoning": 0.41062837839126587,
"adv/mean_abs_step_conf": 0.7731932401657104,
"adv/ratio_final_to_reasoning": 1.0391751525322022,
"adv/ratio_step_to_reasoning": 1.8829513030611242,
"adv/std_final_conf": 0.6997833847999573,
"adv/std_reasoning": 0.7014550566673279,
"adv/std_step_conf": 0.9338797330856323,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5061626385732976,
"calib/avg_num_step_conf": 8.01953125,
"calib/ece": 0.40758196721311474,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9918032786885246,
"calib/gap": 0.009264614749018851,
"calib/mean_conf": 0.9854508196721311,
"calib/mu_c": 0.9893617021276594,
"calib/mu_w": 0.9800970873786405,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40758196721311474,
"calib/std_conf": 0.06350585392650582,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9239586919104991,
"calib/step_q_c_n": 1162.0,
"calib/step_q_gap": 0.005395280013753778,
"calib/step_q_w": 0.9185634118967453,
"calib/step_q_w_n": 891.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3070.0,
"completions/max_terminated_length": 3070.0,
"completions/mean_length": 782.34765625,
"completions/mean_terminated_length": 801.1240234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 321.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.01270360592752695,
"kl": 0.037288665771484375,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.1025,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01925162971019745,
"mask/share_reasoning": 0.8457670211791992,
"mask/share_step_conf": 0.11154384166002274,
"num_tokens": 21075094.0,
"reward": 0.7064595222473145,
"reward_std": 0.2108471393585205,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5627511739730835,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.5493866801261902,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.45908522605895996,
"adv/mean_abs_reasoning": 0.4423348605632782,
"adv/mean_abs_step_conf": 0.7758592367172241,
"adv/ratio_final_to_reasoning": 1.037868065551857,
"adv/ratio_step_to_reasoning": 1.7540087971570435,
"adv/std_final_conf": 0.7211512327194214,
"adv/std_reasoning": 0.7207404971122742,
"adv/std_step_conf": 0.932945966720581,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.234375,
"calib/ece": 0.4290243902439024,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999999,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4290243902439024,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9207659208261618,
"calib/step_q_c_n": 1162.0,
"calib/step_q_gap": 0.0003430878451893271,
"calib/step_q_w": 0.9204228329809725,
"calib/step_q_w_n": 946.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2836.0,
"completions/max_terminated_length": 2836.0,
"completions/mean_length": 774.99609375,
"completions/mean_terminated_length": 787.2976684570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 399.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.011231261305510998,
"kl": 0.03789520263671875,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0531,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01901676505804062,
"mask/share_reasoning": 0.8521870374679565,
"mask/share_step_conf": 0.11317116022109985,
"num_tokens": 21377901.0,
"reward": 0.6887651085853577,
"reward_std": 0.23772430419921875,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.547403872013092,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5301263332366943,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.4625837206840515,
"adv/mean_abs_reasoning": 0.45626822113990784,
"adv/mean_abs_step_conf": 0.7668143510818481,
"adv/ratio_final_to_reasoning": 1.013841637991718,
"adv/ratio_step_to_reasoning": 1.6806218701054703,
"adv/std_final_conf": 0.7177895307540894,
"adv/std_reasoning": 0.7206159830093384,
"adv/std_step_conf": 0.9346785545349121,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.640625,
"calib/ece": 0.4438152610441768,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.1102230246251565e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4438152610441768,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9228031634446396,
"calib/step_q_c_n": 1138.0,
"calib/step_q_gap": 0.011536869217451562,
"calib/step_q_w": 0.9112662942271881,
"calib/step_q_w_n": 1074.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3043.0,
"completions/max_terminated_length": 3043.0,
"completions/mean_length": 726.64453125,
"completions/mean_terminated_length": 741.1195678710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 427.0,
"epoch": 0.0768,
"grad_norm": 0.011547032743692398,
"kl": 0.04325103759765625,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0638,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019756484776735306,
"mask/share_reasoning": 0.840281069278717,
"mask/share_step_conf": 0.12043121457099915,
"num_tokens": 21668330.0,
"reward": 0.7075121402740479,
"reward_std": 0.23124364018440247,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5399808883666992,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.5742621421813965,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.48381316661834717,
"adv/mean_abs_reasoning": 0.4664658010005951,
"adv/mean_abs_step_conf": 0.7619782090187073,
"adv/ratio_final_to_reasoning": 1.037188933423503,
"adv/ratio_step_to_reasoning": 1.633513555300778,
"adv/std_final_conf": 0.7406063079833984,
"adv/std_reasoning": 0.7393958568572998,
"adv/std_step_conf": 0.9333019852638245,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5113636363636364,
"calib/avg_num_step_conf": 8.41796875,
"calib/ece": 0.34019920318725105,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": 0.001136363636363491,
"calib/mean_conf": 0.989601593625498,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9888636363636365,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.34019920318725105,
"calib/std_conf": 0.005701806298877532,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9204226415094339,
"calib/step_q_c_n": 1325.0,
"calib/step_q_gap": 0.006506978858831558,
"calib/step_q_w": 0.9139156626506023,
"calib/step_q_w_n": 830.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2517.0,
"completions/max_terminated_length": 2517.0,
"completions/mean_length": 739.50390625,
"completions/mean_terminated_length": 748.2727661132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 439.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.011704598553478718,
"kl": 0.03731536865234375,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.018,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.019500505179166794,
"mask/share_reasoning": 0.8524847626686096,
"mask/share_step_conf": 0.1162959635257721,
"num_tokens": 21964675.0,
"reward": 0.7815718054771423,
"reward_std": 0.25788426399230957,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6402535438537598,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.6010150909423828,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.43151819705963135,
"adv/mean_abs_reasoning": 0.4239124059677124,
"adv/mean_abs_step_conf": 0.7469241619110107,
"adv/ratio_final_to_reasoning": 1.0179418931478459,
"adv/ratio_step_to_reasoning": 1.7619775958335617,
"adv/std_final_conf": 0.7015545964241028,
"adv/std_reasoning": 0.7014573216438293,
"adv/std_step_conf": 0.9340309500694275,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5102040816326531,
"calib/avg_num_step_conf": 8.33203125,
"calib/ece": 0.3823886639676113,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9919028340080972,
"calib/gap": 0.011020408163265327,
"calib/mean_conf": 0.9856275303643725,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9789795918367347,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3823886639676113,
"calib/std_conf": 0.06310064362496394,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9235135135135136,
"calib/step_q_c_n": 1258.0,
"calib/step_q_gap": 0.012222084942085099,
"calib/step_q_w": 0.9112914285714285,
"calib/step_q_w_n": 875.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2662.0,
"completions/max_terminated_length": 2662.0,
"completions/mean_length": 758.73828125,
"completions/mean_terminated_length": 773.8526000976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 424.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.011972285807132721,
"kl": 0.037693023681640625,
"learning_rate": 3.5e-06,
"loss": -0.0833,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.018995292484760284,
"mask/share_reasoning": 0.8444027900695801,
"mask/share_step_conf": 0.11707063764333725,
"num_tokens": 22262840.0,
"reward": 0.7532713413238525,
"reward_std": 0.23117206990718842,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.5940839648246765,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.603083610534668,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.41009172797203064,
"adv/mean_abs_reasoning": 0.4013540744781494,
"adv/mean_abs_step_conf": 0.7563887238502502,
"adv/ratio_final_to_reasoning": 1.0217704367527405,
"adv/ratio_step_to_reasoning": 1.8845921144159947,
"adv/std_final_conf": 0.6615861058235168,
"adv/std_reasoning": 0.6613550782203674,
"adv/std_step_conf": 0.9317781329154968,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5013578522656735,
"calib/avg_num_step_conf": 8.97265625,
"calib/ece": 0.2767330677290837,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.7157045313308537e-05,
"calib/mean_conf": 0.9898804780876495,
"calib/mu_c": 0.9898882681564246,
"calib/mu_w": 0.9898611111111113,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2767330677290837,
"calib/std_conf": 0.001086707704939113,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9227473219911784,
"calib/step_q_c_n": 1587.0,
"calib/step_q_gap": 0.006986758610896593,
"calib/step_q_w": 0.9157605633802818,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1547.0,
"completions/max_terminated_length": 1547.0,
"completions/mean_length": 703.82421875,
"completions/mean_terminated_length": 717.8446655273438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 406.0,
"epoch": 0.08,
"grad_norm": 0.011645482853055,
"kl": 0.042751312255859375,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0808,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.020144283771514893,
"mask/share_reasoning": 0.8334736824035645,
"mask/share_step_conf": 0.12685082852840424,
"num_tokens": 22547771.0,
"reward": 0.8555404543876648,
"reward_std": 0.20708394050598145,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7048202753067017,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": 0.6703230142593384,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.29195424914360046,
"adv/mean_abs_reasoning": 0.2810046076774597,
"adv/mean_abs_step_conf": 0.7327713966369629,
"adv/ratio_final_to_reasoning": 1.0389660566659067,
"adv/ratio_step_to_reasoning": 2.6076846308443677,
"adv/std_final_conf": 0.5702286958694458,
"adv/std_reasoning": 0.5727683901786804,
"adv/std_step_conf": 0.9335572123527527,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.921875,
"calib/ece": 0.300204081632653,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 2.220446049250313e-16,
"calib/mean_conf": 0.99,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9899999999999998,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.300204081632653,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9203335750543873,
"calib/step_q_c_n": 1379.0,
"calib/step_q_gap": 0.019681641352729873,
"calib/step_q_w": 0.9006519337016574,
"calib/step_q_w_n": 905.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2994.0,
"completions/max_terminated_length": 2994.0,
"completions/mean_length": 778.91015625,
"completions/mean_terminated_length": 804.0362548828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 437.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.012540625408291817,
"kl": 0.03820037841796875,
"learning_rate": 3.444444444444445e-06,
"loss": -0.1275,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.018356502056121826,
"mask/share_reasoning": 0.8417627811431885,
"mask/share_step_conf": 0.10863067209720612,
"num_tokens": 22850228.0,
"reward": 0.7930084466934204,
"reward_std": 0.15477627515792847,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.6581863164901733,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6059556007385254,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.5305771827697754,
"adv/mean_abs_reasoning": 0.5090773701667786,
"adv/mean_abs_step_conf": 0.7380466461181641,
"adv/ratio_final_to_reasoning": 1.042232897910888,
"adv/ratio_step_to_reasoning": 1.4497730391676475,
"adv/std_final_conf": 0.7934849858283997,
"adv/std_reasoning": 0.7929652333259583,
"adv/std_step_conf": 0.93379807472229,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5125,
"calib/avg_num_step_conf": 9.19140625,
"calib/ece": 0.2594190871369296,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004449300699300718,
"calib/mean_conf": 0.9897095435684649,
"calib/mu_c": 0.9898295454545455,
"calib/mu_w": 0.9893846153846154,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2594190871369296,
"calib/std_conf": 0.0027927499867989804,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.922860962566845,
"calib/step_q_c_n": 1496.0,
"calib/step_q_gap": -0.004105198459992909,
"calib/step_q_w": 0.9269661610268379,
"calib/step_q_w_n": 857.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2544.0,
"completions/max_terminated_length": 2544.0,
"completions/mean_length": 763.61328125,
"completions/mean_terminated_length": 791.437255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.01434855256229639,
"kl": 0.038990020751953125,
"learning_rate": 3.416666666666667e-06,
"loss": -0.1243,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018347784876823425,
"mask/share_reasoning": 0.8309397101402283,
"mask/share_step_conf": 0.11555621773004532,
"num_tokens": 23150377.0,
"reward": 0.8227552771568298,
"reward_std": 0.2816285789012909,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.6927835941314697,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.6269456148147583,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.5190737843513489,
"adv/mean_abs_reasoning": 0.5054394602775574,
"adv/mean_abs_step_conf": 0.7535461783409119,
"adv/ratio_final_to_reasoning": 1.026975187228762,
"adv/ratio_step_to_reasoning": 1.4908732648754985,
"adv/std_final_conf": 0.7569723129272461,
"adv/std_reasoning": 0.7576473951339722,
"adv/std_step_conf": 0.9331627488136292,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 8.32421875,
"calib/ece": 0.38516129032258073,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -2.220446049250313e-16,
"calib/mean_conf": 0.9900000000000001,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38516129032258073,
"calib/std_conf": 1.1102230246251565e-16,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9183696529459242,
"calib/step_q_c_n": 1239.0,
"calib/step_q_gap": 0.0008023883719331382,
"calib/step_q_w": 0.9175672645739911,
"calib/step_q_w_n": 892.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1651.0,
"completions/max_terminated_length": 1651.0,
"completions/mean_length": 810.0390625,
"completions/mean_terminated_length": 826.17529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.0832,
"grad_norm": 0.014095221646130085,
"kl": 0.034351348876953125,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.0805,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.017470698803663254,
"mask/share_reasoning": 0.8594420552253723,
"mask/share_step_conf": 0.10355598479509354,
"num_tokens": 23465771.0,
"reward": 0.7383511066436768,
"reward_std": 0.2517282962799072,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.5934968590736389,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.5722677707672119,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.5218999981880188,
"adv/mean_abs_reasoning": 0.4994032084941864,
"adv/mean_abs_step_conf": 0.7597758769989014,
"adv/ratio_final_to_reasoning": 1.0450473471359252,
"adv/ratio_step_to_reasoning": 1.521367632558464,
"adv/std_final_conf": 0.7779203653335571,
"adv/std_reasoning": 0.7754961848258972,
"adv/std_step_conf": 0.935109555721283,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.49411764705882355,
"calib/avg_num_step_conf": 8.578125,
"calib/ece": 0.2843983402489627,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.000294117647058445,
"calib/mean_conf": 0.989792531120332,
"calib/mu_c": 0.9897058823529412,
"calib/mu_w": 0.9899999999999997,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2843983402489627,
"calib/std_conf": 0.002647810146646279,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9181918008784773,
"calib/step_q_c_n": 1366.0,
"calib/step_q_gap": -0.0019286810492336048,
"calib/step_q_w": 0.9201204819277109,
"calib/step_q_w_n": 830.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 777.83984375,
"completions/mean_terminated_length": 809.4592895507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 477.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.014404214918613434,
"kl": 0.033031463623046875,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.1326,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.01771673932671547,
"mask/share_reasoning": 0.8376704454421997,
"mask/share_step_conf": 0.10555030405521393,
"num_tokens": 23771274.0,
"reward": 0.8042483925819397,
"reward_std": 0.27935120463371277,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6695046424865723,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.6178983449935913,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.42294803261756897,
"adv/mean_abs_reasoning": 0.4056248068809509,
"adv/mean_abs_step_conf": 0.75342857837677,
"adv/ratio_final_to_reasoning": 1.0427075105929169,
"adv/ratio_step_to_reasoning": 1.857451925019709,
"adv/std_final_conf": 0.7015489935874939,
"adv/std_reasoning": 0.7013869285583496,
"adv/std_step_conf": 0.9324942231178284,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5080645161290323,
"calib/avg_num_step_conf": 9.09375,
"calib/ece": 0.2388755020080322,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00048387096774160643,
"calib/mean_conf": 0.9898795180722892,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9895161290322584,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2388755020080322,
"calib/std_conf": 0.001897351294942607,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9194984709480123,
"calib/step_q_c_n": 1635.0,
"calib/step_q_gap": -0.00651884506930378,
"calib/step_q_w": 0.9260173160173161,
"calib/step_q_w_n": 693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2399.0,
"completions/max_terminated_length": 2399.0,
"completions/mean_length": 723.2734375,
"completions/mean_terminated_length": 737.6812744140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 453.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.010196685791015625,
"kl": 0.036296844482421875,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0607,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019432738423347473,
"mask/share_reasoning": 0.837583065032959,
"mask/share_step_conf": 0.12345296889543533,
"num_tokens": 24058592.0,
"reward": 0.8649125695228577,
"reward_std": 0.2212262749671936,
"rewards/accuracy_reward_step": 0.73046875,
"rewards/final_brier_reward_step": 0.7354437112808228,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.6537563800811768,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.4704670011997223,
"adv/mean_abs_reasoning": 0.4546546936035156,
"adv/mean_abs_step_conf": 0.7636871337890625,
"adv/ratio_final_to_reasoning": 1.0347787184838697,
"adv/ratio_step_to_reasoning": 1.6797080169484415,
"adv/std_final_conf": 0.7218642830848694,
"adv/std_reasoning": 0.7208608388900757,
"adv/std_step_conf": 0.9334217309951782,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5072463768115942,
"calib/avg_num_step_conf": 9.02734375,
"calib/ece": 0.2712653061224489,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9959183673469387,
"calib/gap": 0.0013043478260872376,
"calib/mean_conf": 0.9896326530612244,
"calib/mu_c": 0.9900000000000001,
"calib/mu_w": 0.9886956521739129,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2712653061224489,
"calib/std_conf": 0.00573814261903346,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9083748271092671,
"calib/step_q_c_n": 1446.0,
"calib/step_q_gap": 0.01323031843874689,
"calib/step_q_w": 0.8951445086705202,
"calib/step_q_w_n": 865.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2704.0,
"completions/max_terminated_length": 2704.0,
"completions/mean_length": 750.98046875,
"completions/mean_terminated_length": 778.3441772460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.0864,
"grad_norm": 0.020107468590140343,
"kl": 0.06332015991210938,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.1792,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.018680231645703316,
"mask/share_reasoning": 0.8345333337783813,
"mask/share_step_conf": 0.11163021624088287,
"num_tokens": 24357091.0,
"reward": 0.8299020528793335,
"reward_std": 0.2704859673976898,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.6934593915939331,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6374385356903076,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.5415908694267273,
"adv/mean_abs_reasoning": 0.5319398641586304,
"adv/mean_abs_step_conf": 0.7804492712020874,
"adv/ratio_final_to_reasoning": 1.018143038186021,
"adv/ratio_step_to_reasoning": 1.4671757538542904,
"adv/std_final_conf": 0.7750914692878723,
"adv/std_reasoning": 0.7754891514778137,
"adv/std_step_conf": 0.9343975782394409,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5054945054945055,
"calib/avg_num_step_conf": 9.28515625,
"calib/ece": 0.35542168674698804,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00010989010988993186,
"calib/mean_conf": 0.9899598393574298,
"calib/mu_c": 0.9899999999999998,
"calib/mu_w": 0.9898901098901098,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35542168674698804,
"calib/std_conf": 0.0006324504316475354,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9176951130561636,
"calib/step_q_c_n": 1371.0,
"calib/step_q_gap": -0.006678644399104927,
"calib/step_q_w": 0.9243737574552685,
"calib/step_q_w_n": 1006.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2241.0,
"completions/max_terminated_length": 2241.0,
"completions/mean_length": 732.8984375,
"completions/mean_terminated_length": 744.5317993164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.010728063993155956,
"kl": 0.034942626953125,
"learning_rate": 3.277777777777778e-06,
"loss": -0.0595,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019511591643095016,
"mask/share_reasoning": 0.8416132926940918,
"mask/share_step_conf": 0.1232500821352005,
"num_tokens": 24650265.0,
"reward": 0.768690288066864,
"reward_std": 0.2716025710105896,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6242765188217163,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.5951353311538696,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.3733702301979065,
"adv/mean_abs_reasoning": 0.35768407583236694,
"adv/mean_abs_step_conf": 0.758823812007904,
"adv/ratio_final_to_reasoning": 1.0438547741579949,
"adv/ratio_step_to_reasoning": 2.121491738881706,
"adv/std_final_conf": 0.659427285194397,
"adv/std_reasoning": 0.6613901257514954,
"adv/std_step_conf": 0.9329133629798889,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.4963768115942029,
"calib/avg_num_step_conf": 9.39453125,
"calib/ece": 0.42205761316872437,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -7.24637681158713e-05,
"calib/mean_conf": 0.9899588477366256,
"calib/mu_c": 0.9899275362318841,
"calib/mu_w": 0.99,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.42205761316872437,
"calib/std_conf": 0.000640178978852019,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9205405405405407,
"calib/step_q_c_n": 1147.0,
"calib/step_q_gap": 0.000993640699523013,
"calib/step_q_w": 0.9195468998410177,
"calib/step_q_w_n": 1258.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2190.0,
"completions/max_terminated_length": 2190.0,
"completions/mean_length": 800.12890625,
"completions/mean_terminated_length": 829.283447265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 468.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.013022164814174175,
"kl": 0.031452178955078125,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.2175,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.017471952363848686,
"mask/share_reasoning": 0.8387032151222229,
"mask/share_step_conf": 0.10866856575012207,
"num_tokens": 24962362.0,
"reward": 0.6908458471298218,
"reward_std": 0.18251261115074158,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.54716956615448,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.5368659496307373,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.5106885433197021,
"adv/mean_abs_reasoning": 0.5043027997016907,
"adv/mean_abs_step_conf": 0.776948869228363,
"adv/ratio_final_to_reasoning": 1.0126625186728861,
"adv/ratio_step_to_reasoning": 1.5406396111382887,
"adv/std_final_conf": 0.7742244601249695,
"adv/std_reasoning": 0.7754048705101013,
"adv/std_step_conf": 0.9331316351890564,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5164835164835164,
"calib/avg_num_step_conf": 9.15625,
"calib/ece": 0.35330612244897963,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9918367346938776,
"calib/gap": 0.021868131868131968,
"calib/mean_conf": 0.9818775510204082,
"calib/mu_c": 0.9899999999999999,
"calib/mu_w": 0.9681318681318679,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35330612244897963,
"calib/std_conf": 0.08908002195986395,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9211308203991131,
"calib/step_q_c_n": 1353.0,
"calib/step_q_gap": 0.0016555429016358314,
"calib/step_q_w": 0.9194752774974773,
"calib/step_q_w_n": 991.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 772.91796875,
"completions/mean_terminated_length": 794.6465454101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 394.0,
"epoch": 0.0896,
"grad_norm": 0.011630590073764324,
"kl": 0.033512115478515625,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.1131,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01856730505824089,
"mask/share_reasoning": 0.8370269536972046,
"mask/share_step_conf": 0.11706199496984482,
"num_tokens": 25266149.0,
"reward": 0.7697278261184692,
"reward_std": 0.2595728933811188,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6163101196289062,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6114266514778137,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.5432265996932983,
"adv/mean_abs_reasoning": 0.5405408143997192,
"adv/mean_abs_step_conf": 0.7449895143508911,
"adv/ratio_final_to_reasoning": 1.0049687002757817,
"adv/ratio_step_to_reasoning": 1.3782299032834662,
"adv/std_final_conf": 0.792640745639801,
"adv/std_reasoning": 0.7929560542106628,
"adv/std_step_conf": 0.9342106580734253,
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.5059523809523809,
"calib/avg_num_step_conf": 9.296875,
"calib/ece": 0.35034334763948494,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00047619047619074095,
"calib/mean_conf": 0.9898283261802575,
"calib/mu_c": 0.99,
"calib/mu_w": 0.9895238095238093,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.35034334763948494,
"calib/std_conf": 0.002614857718751559,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.918590240123935,
"calib/step_q_c_n": 1291.0,
"calib/step_q_gap": -0.008581477047782116,
"calib/step_q_w": 0.9271717171717171,
"calib/step_q_w_n": 1089.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2893.0,
"completions/max_terminated_length": 2893.0,
"completions/mean_length": 790.203125,
"completions/mean_terminated_length": 842.8833618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.014631648547947407,
"kl": 0.031978607177734375,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.2057,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.017018930986523628,
"mask/share_reasoning": 0.8115172386169434,
"mask/share_step_conf": 0.10896383225917816,
"num_tokens": 25576265.0,
"reward": 0.7271938323974609,
"reward_std": 0.26602375507354736,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.588728129863739,
"rewards/format_reward_step": 0.90625,
"rewards/step_l1_reward": 0.5680031776428223,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.5152809619903564,
"adv/mean_abs_reasoning": 0.5122286677360535,
"adv/mean_abs_step_conf": 0.7808734774589539,
"adv/ratio_final_to_reasoning": 1.0059588508932806,
"adv/ratio_step_to_reasoning": 1.5244626602221538,
"adv/std_final_conf": 0.7563278675079346,
"adv/std_reasoning": 0.7577018141746521,
"adv/std_step_conf": 0.9337679147720337,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5006368999731832,
"calib/avg_num_step_conf": 9.125,
"calib/ece": 0.4511428571428572,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 1.2737999463663385e-05,
"calib/mean_conf": 0.9899183673469388,
"calib/mu_c": 0.9899242424242424,
"calib/mu_w": 0.9899115044247787,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4511428571428572,
"calib/std_conf": 0.0008998125585734132,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.917151515151515,
"calib/step_q_c_n": 1155.0,
"calib/step_q_gap": -0.00500767197803631,
"calib/step_q_w": 0.9221591871295514,
"calib/step_q_w_n": 1181.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2438.0,
"completions/max_terminated_length": 2438.0,
"completions/mean_length": 782.5546875,
"completions/mean_terminated_length": 811.06884765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 419.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.012190482579171658,
"kl": 0.03171539306640625,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.119,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.018094191327691078,
"mask/share_reasoning": 0.8291984796524048,
"mask/share_step_conf": 0.11755112558603287,
"num_tokens": 25882111.0,
"reward": 0.6687038540840149,
"reward_std": 0.2648327946662903,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5242776870727539,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.5201611518859863,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.4775123596191406,
"adv/mean_abs_reasoning": 0.45811158418655396,
"adv/mean_abs_step_conf": 0.770564079284668,
"adv/ratio_final_to_reasoning": 1.0423494539371574,
"adv/ratio_step_to_reasoning": 1.682044519028962,
"adv/std_final_conf": 0.7239235639572144,
"adv/std_reasoning": 0.720928430557251,
"adv/std_step_conf": 0.9331386685371399,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.5131552317344668,
"calib/avg_num_step_conf": 9.30078125,
"calib/ece": 0.21755274261603375,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0008166363084392003,
"calib/mean_conf": 0.989704641350211,
"calib/mu_c": 0.9898907103825135,
"calib/mu_w": 0.9890740740740743,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21755274261603375,
"calib/std_conf": 0.0028159618081795104,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9205956112852663,
"calib/step_q_c_n": 1595.0,
"calib/step_q_gap": 0.010748283040991402,
"calib/step_q_w": 0.9098473282442749,
"calib/step_q_w_n": 786.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 731.484375,
"completions/mean_terminated_length": 786.8067626953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 447.0,
"epoch": 0.0928,
"grad_norm": 0.013589132577180862,
"kl": 0.032924652099609375,
"learning_rate": 3.138888888888889e-06,
"loss": -0.2468,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.01762017048895359,
"mask/share_reasoning": 0.8009259700775146,
"mask/share_step_conf": 0.11114132404327393,
"num_tokens": 26174867.0,
"reward": 0.8554751873016357,
"reward_std": 0.2645605504512787,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7154418230056763,
"rewards/format_reward_step": 0.921875,
"rewards/step_l1_reward": 0.668164849281311,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.4434081017971039,
"adv/mean_abs_reasoning": 0.4059925675392151,
"adv/mean_abs_step_conf": 0.7615506649017334,
"adv/ratio_final_to_reasoning": 1.0921581754185063,
"adv/ratio_step_to_reasoning": 1.8757748929188827,
"adv/std_final_conf": 0.7220824956893921,
"adv/std_reasoning": 0.7014883756637573,
"adv/std_step_conf": 0.9331439137458801,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5073099415204678,
"calib/avg_num_step_conf": 8.7734375,
"calib/ece": 0.2975303643724697,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0001461988304094497,
"calib/mean_conf": 0.989838056680162,
"calib/mu_c": 0.9898830409356724,
"calib/mu_w": 0.9897368421052629,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2975303643724697,
"calib/std_conf": 0.001262223260576511,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9183161764705882,
"calib/step_q_c_n": 1360.0,
"calib/step_q_gap": -0.009471633913158972,
"calib/step_q_w": 0.9277878103837471,
"calib/step_q_w_n": 886.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2986.0,
"completions/max_terminated_length": 2986.0,
"completions/mean_length": 809.953125,
"completions/mean_terminated_length": 826.087646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.013744988478720188,
"kl": 0.029903411865234375,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0728,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.017987487837672234,
"mask/share_reasoning": 0.8538491725921631,
"mask/share_step_conf": 0.10863205790519714,
"num_tokens": 26492063.0,
"reward": 0.8196660280227661,
"reward_std": 0.23798441886901855,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.6738835573196411,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.6396672129631042,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.4719206392765045,
"adv/mean_abs_reasoning": 0.4592221975326538,
"adv/mean_abs_step_conf": 0.7370298504829407,
"adv/ratio_final_to_reasoning": 1.027652064321102,
"adv/ratio_step_to_reasoning": 1.6049525794765895,
"adv/std_final_conf": 0.7544752955436707,
"adv/std_reasoning": 0.7576509118080139,
"adv/std_step_conf": 0.9334758520126343,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.49603174603174605,
"calib/avg_num_step_conf": 9.38671875,
"calib/ece": 0.4604621848739495,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002380952380954815,
"calib/mean_conf": 0.9898739495798319,
"calib/mu_c": 0.9897619047619046,
"calib/mu_w": 0.9900000000000001,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4604621848739495,
"calib/std_conf": 0.0019405215527320172,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9215805168986083,
"calib/step_q_c_n": 1006.0,
"calib/step_q_gap": -0.003208316315421844,
"calib/step_q_w": 0.9247888332140302,
"calib/step_q_w_n": 1397.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2967.0,
"completions/max_terminated_length": 2967.0,
"completions/mean_length": 831.82421875,
"completions/mean_terminated_length": 872.7335815429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 464.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.013959411531686783,
"kl": 0.032283782958984375,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.2316,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.016758153215050697,
"mask/share_reasoning": 0.8345001935958862,
"mask/share_step_conf": 0.10186664760112762,
"num_tokens": 26813898.0,
"reward": 0.6590661406517029,
"reward_std": 0.21499761939048767,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5008386373519897,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.5329186320304871,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.4721667170524597,
"adv/mean_abs_reasoning": 0.4331371486186981,
"adv/mean_abs_step_conf": 0.7640393376350403,
"adv/ratio_final_to_reasoning": 1.0901090302649619,
"adv/ratio_step_to_reasoning": 1.7639663096818416,
"adv/std_final_conf": 0.7346081137657166,
"adv/std_reasoning": 0.7207502126693726,
"adv/std_step_conf": 0.9328594207763672,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.49680641991483787,
"calib/avg_num_step_conf": 9.9609375,
"calib/ece": 0.281687242798354,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 9.826400262069601e-05,
"calib/mean_conf": 0.9895061728395063,
"calib/mu_c": 0.9895348837209301,
"calib/mu_w": 0.9894366197183094,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.281687242798354,
"calib/std_conf": 0.003103655577771987,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9143071161048688,
"calib/step_q_c_n": 1602.0,
"calib/step_q_gap": 0.005562390366472214,
"calib/step_q_w": 0.9087447257383966,
"calib/step_q_w_n": 948.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2874.0,
"completions/max_terminated_length": 2874.0,
"completions/mean_length": 768.1328125,
"completions/mean_terminated_length": 802.620361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 402.0,
"epoch": 0.096,
"grad_norm": 0.013207972049713135,
"kl": 0.0336456298828125,
"learning_rate": 3.055555555555556e-06,
"loss": -0.2008,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01795533299446106,
"mask/share_reasoning": 0.8207667469978333,
"mask/share_step_conf": 0.11830917745828629,
"num_tokens": 27113860.0,
"reward": 0.8244093656539917,
"reward_std": 0.23067374527454376,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6776206493377686,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6469793319702148,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.5017357468605042,
"adv/mean_abs_reasoning": 0.4565935730934143,
"adv/mean_abs_step_conf": 0.7459276914596558,
"adv/ratio_final_to_reasoning": 1.0988672999956008,
"adv/ratio_step_to_reasoning": 1.633679787488044,
"adv/std_final_conf": 0.7723668217658997,
"adv/std_reasoning": 0.7395398616790771,
"adv/std_step_conf": 0.9347891211509705,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.5230179028132993,
"calib/avg_num_step_conf": 9.640625,
"calib/ece": 0.27794979079497917,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.99581589958159,
"calib/gap": 0.0005711849957374948,
"calib/mean_conf": 0.9892468619246862,
"calib/mu_c": 0.9894117647058823,
"calib/mu_w": 0.9888405797101448,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27794979079497917,
"calib/std_conf": 0.006553428674440355,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9171748251748252,
"calib/step_q_c_n": 1430.0,
"calib/step_q_gap": -0.008461012975463866,
"calib/step_q_w": 0.925635838150289,
"calib/step_q_w_n": 1038.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 1890.0,
"completions/max_terminated_length": 1890.0,
"completions/mean_length": 776.66015625,
"completions/mean_terminated_length": 821.5908813476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 441.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.013960733078420162,
"kl": 0.034755706787109375,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.276,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.017011146992444992,
"mask/share_reasoning": 0.8231663107872009,
"mask/share_step_conf": 0.10513506829738617,
"num_tokens": 27420397.0,
"reward": 0.8049872517585754,
"reward_std": 0.25266385078430176,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.6699300408363342,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l1_reward": 0.6205131411552429,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.4236437678337097,
"adv/mean_abs_reasoning": 0.3485650420188904,
"adv/mean_abs_step_conf": 0.7679628133773804,
"adv/ratio_final_to_reasoning": 1.215393733634225,
"adv/ratio_step_to_reasoning": 2.2032123730174895,
"adv/std_final_conf": 0.7064496278762817,
"adv/std_reasoning": 0.6401773691177368,
"adv/std_step_conf": 0.9319038391113281,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5535289452815226,
"calib/avg_num_step_conf": 9.47265625,
"calib/ece": 0.37217391304347824,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.002861485593444124,
"calib/mean_conf": 0.9887747035573122,
"calib/mu_c": 0.9898717948717948,
"calib/mu_w": 0.9870103092783507,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37217391304347824,
"calib/std_conf": 0.0066890737711640105,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9190198863636364,
"calib/step_q_c_n": 1408.0,
"calib/step_q_gap": 0.00184191192902472,
"calib/step_q_w": 0.9171779744346117,
"calib/step_q_w_n": 1017.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1900.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 768.44921875,
"completions/mean_terminated_length": 777.561279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 443.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.01794438250362873,
"kl": 0.036670684814453125,
"learning_rate": 3e-06,
"loss": -0.0148,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.019045470282435417,
"mask/share_reasoning": 0.8425126075744629,
"mask/share_step_conf": 0.12672311067581177,
"num_tokens": 27723840.0,
"reward": 0.7808723449707031,
"reward_std": 0.17103058099746704,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6190499663352966,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": 0.6231634020805359,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.4681822657585144,
"adv/mean_abs_reasoning": 0.4340810775756836,
"adv/mean_abs_step_conf": 0.7634403705596924,
"adv/ratio_final_to_reasoning": 1.078559490253028,
"adv/ratio_step_to_reasoning": 1.7587506343825452,
"adv/std_final_conf": 0.721306324005127,
"adv/std_reasoning": 0.7015017867088318,
"adv/std_step_conf": 0.9331609606742859,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5084011452682339,
"calib/avg_num_step_conf": 10.00390625,
"calib/ece": 0.3366942148760331,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004611211573238849,
"calib/mean_conf": 0.9895867768595042,
"calib/mu_c": 0.989746835443038,
"calib/mu_w": 0.9892857142857141,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3366942148760331,
"calib/std_conf": 0.003121954837607293,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9171076706544687,
"calib/step_q_c_n": 1421.0,
"calib/step_q_gap": -0.007032680222724186,
"calib/step_q_w": 0.9241403508771929,
"calib/step_q_w_n": 1140.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2821.0,
"completions/max_terminated_length": 2821.0,
"completions/mean_length": 762.84765625,
"completions/mean_terminated_length": 803.6583862304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.0992,
"grad_norm": 3.9375298023223877,
"kl": 1.6125869750976562,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.1713,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.017697613686323166,
"mask/share_reasoning": 0.8151333928108215,
"mask/share_step_conf": 0.1163877323269844,
"num_tokens": 28024905.0,
"reward": 0.7663431167602539,
"reward_std": 0.20701514184474945,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6241070032119751,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.5960791110992432,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.521461009979248,
"adv/mean_abs_reasoning": 0.4299072027206421,
"adv/mean_abs_step_conf": 0.7327769994735718,
"adv/ratio_final_to_reasoning": 1.2129617896123004,
"adv/ratio_step_to_reasoning": 1.704500401101066,
"adv/std_final_conf": 0.7765588164329529,
"adv/std_reasoning": 0.7206876277923584,
"adv/std_step_conf": 0.9341228008270264,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.4877741960845978,
"calib/avg_num_step_conf": 9.109375,
"calib/ece": 0.3171250000000002,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.9958333333333333,
"calib/gap": -0.000401761144743773,
"calib/mean_conf": 0.9879583333333335,
"calib/mu_c": 0.9878260869565217,
"calib/mu_w": 0.9882278481012655,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3171250000000002,
"calib/std_conf": 0.009464755528920028,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9079537237888646,
"calib/step_q_c_n": 1383.0,
"calib/step_q_gap": -0.009917719836003758,
"calib/step_q_w": 0.9178714436248684,
"calib/step_q_w_n": 949.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 773.4140625,
"completions/mean_terminated_length": 814.7901000976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 442.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.016173789277672768,
"kl": 0.0373992919921875,
"learning_rate": 2.944444444444445e-06,
"loss": -0.167,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.01777365803718567,
"mask/share_reasoning": 0.8225142955780029,
"mask/share_step_conf": 0.108930803835392,
"num_tokens": 28331579.0,
"reward": 0.7784872651100159,
"reward_std": 0.23750649392604828,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6359518766403198,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.6077413558959961,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.5023649334907532,
"adv/mean_abs_reasoning": 0.4079497456550598,
"adv/mean_abs_step_conf": 0.7471914291381836,
"adv/ratio_final_to_reasoning": 1.2314382809188604,
"adv/ratio_step_to_reasoning": 1.8315771417834592,
"adv/std_final_conf": 0.7587032914161682,
"adv/std_reasoning": 0.7014654874801636,
"adv/std_step_conf": 0.933836042881012,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5303065134099616,
"calib/avg_num_step_conf": 8.8125,
"calib/ece": 0.288714859437751,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.001397701149425279,
"calib/mean_conf": 0.9875100401606426,
"calib/mu_c": 0.9879310344827585,
"calib/mu_w": 0.9865333333333333,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.288714859437751,
"calib/std_conf": 0.008372892026171202,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9147102425876011,
"calib/step_q_c_n": 1484.0,
"calib/step_q_gap": 0.016135113053922345,
"calib/step_q_w": 0.8985751295336788,
"calib/step_q_w_n": 772.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1592.0,
"completions/max_terminated_length": 1592.0,
"completions/mean_length": 765.5234375,
"completions/mean_terminated_length": 774.600830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.021871555596590042,
"kl": 0.04198455810546875,
"learning_rate": 2.916666666666667e-06,
"loss": -0.0602,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.018963484093546867,
"mask/share_reasoning": 0.8512589931488037,
"mask/share_step_conf": 0.11805877089500427,
"num_tokens": 28633681.0,
"reward": 0.8162245750427246,
"reward_std": 0.23270940780639648,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.6833745837211609,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.6201682090759277,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.5089430212974548,
"adv/mean_abs_reasoning": 0.45139145851135254,
"adv/mean_abs_step_conf": 0.7414897680282593,
"adv/ratio_final_to_reasoning": 1.127498120978855,
"adv/ratio_step_to_reasoning": 1.6426756732917018,
"adv/std_final_conf": 0.7677562236785889,
"adv/std_reasoning": 0.7394993305206299,
"adv/std_step_conf": 0.9327307343482971,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.49949494949494955,
"calib/avg_num_step_conf": 9.4921875,
"calib/ece": 0.25711382113821135,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0002525252525252819,
"calib/mean_conf": 0.9888211382113821,
"calib/mu_c": 0.9888888888888889,
"calib/mu_w": 0.9886363636363636,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.25711382113821135,
"calib/std_conf": 0.004667013669430779,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.918825857519789,
"calib/step_q_c_n": 1516.0,
"calib/step_q_gap": -0.008187271583055655,
"calib/step_q_w": 0.9270131291028446,
"calib/step_q_w_n": 914.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2116.0,
"completions/max_terminated_length": 2116.0,
"completions/mean_length": 711.53515625,
"completions/mean_terminated_length": 737.4615478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.1024,
"grad_norm": 0.017186082899570465,
"kl": 0.0482330322265625,
"learning_rate": 2.888888888888889e-06,
"loss": -0.258,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019098231568932533,
"mask/share_reasoning": 0.830284595489502,
"mask/share_step_conf": 0.11546093225479126,
"num_tokens": 28921650.0,
"reward": 0.8585371375083923,
"reward_std": 0.24619776010513306,
"rewards/accuracy_reward_step": 0.703125,
"rewards/final_brier_reward_step": 0.7048609256744385,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.6809632778167725,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.5731841325759888,
"adv/mean_abs_reasoning": 0.48445412516593933,
"adv/mean_abs_step_conf": 0.7737342715263367,
"adv/ratio_final_to_reasoning": 1.1831546121723227,
"adv/ratio_step_to_reasoning": 1.5971259843464243,
"adv/std_final_conf": 0.7974430322647095,
"adv/std_reasoning": 0.7393472790718079,
"adv/std_step_conf": 0.9335328936576843,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5352779269602578,
"calib/avg_num_step_conf": 8.61328125,
"calib/ece": 0.38015999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0013372717508056686,
"calib/mean_conf": 0.9881599999999999,
"calib/mu_c": 0.9886842105263158,
"calib/mu_w": 0.9873469387755102,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.38015999999999994,
"calib/std_conf": 0.006372942805329425,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9205419847328243,
"calib/step_q_c_n": 1310.0,
"calib/step_q_gap": 0.011938632777517055,
"calib/step_q_w": 0.9086033519553073,
"calib/step_q_w_n": 895.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2486.0,
"completions/max_terminated_length": 2486.0,
"completions/mean_length": 775.078125,
"completions/mean_terminated_length": 781.1810913085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.020701313391327858,
"kl": 0.0483245849609375,
"learning_rate": 2.861111111111111e-06,
"loss": -0.0614,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.018780209124088287,
"mask/share_reasoning": 0.8552939891815186,
"mask/share_step_conf": 0.11811327934265137,
"num_tokens": 29225142.0,
"reward": 0.7515037059783936,
"reward_std": 0.26109981536865234,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6032609343528748,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": 0.5849027633666992,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.4930378794670105,
"adv/mean_abs_reasoning": 0.4306323528289795,
"adv/mean_abs_step_conf": 0.7536168694496155,
"adv/ratio_final_to_reasoning": 1.144916019960104,
"adv/ratio_step_to_reasoning": 1.750023806847846,
"adv/std_final_conf": 0.7348681092262268,
"adv/std_reasoning": 0.7015235424041748,
"adv/std_step_conf": 0.9335595965385437,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.4756439393939394,
"calib/avg_num_step_conf": 9.06640625,
"calib/ece": 0.358361344537815,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.001118181818181907,
"calib/mean_conf": 0.9886134453781511,
"calib/mu_c": 0.9882,
"calib/mu_w": 0.9893181818181819,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.358361344537815,
"calib/std_conf": 0.005666663897392195,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9185202492211839,
"calib/step_q_c_n": 1284.0,
"calib/step_q_gap": 0.005415138324366131,
"calib/step_q_w": 0.9131051108968178,
"calib/step_q_w_n": 1037.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2945.0,
"completions/max_terminated_length": 2945.0,
"completions/mean_length": 781.96484375,
"completions/mean_terminated_length": 820.4220581054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.01520277839154005,
"kl": 0.049041748046875,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.2201,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.017720902338624,
"mask/share_reasoning": 0.8247094750404358,
"mask/share_step_conf": 0.11069461703300476,
"num_tokens": 29531509.0,
"reward": 0.735116183757782,
"reward_std": 0.22515791654586792,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.5931308269500732,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.5739765167236328,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.6646596193313599,
"adv/mean_abs_reasoning": 0.5712630748748779,
"adv/mean_abs_step_conf": 0.7645260691642761,
"adv/ratio_final_to_reasoning": 1.1634913029814473,
"adv/ratio_step_to_reasoning": 1.3383082204844554,
"adv/std_final_conf": 0.8679842948913574,
"adv/std_reasoning": 0.8267807364463806,
"adv/std_step_conf": 0.934746265411377,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.4874451399381251,
"calib/avg_num_step_conf": 9.3359375,
"calib/ece": 0.5084745762711865,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.9957627118644068,
"calib/gap": 0.0009583423267860436,
"calib/mean_conf": 0.9872881355932204,
"calib/mu_c": 0.9877876106194691,
"calib/mu_w": 0.986829268292683,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5084745762711865,
"calib/std_conf": 0.013757033144205443,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9159288537549407,
"calib/step_q_c_n": 1012.0,
"calib/step_q_gap": 0.004034804408061032,
"calib/step_q_w": 0.9118940493468797,
"calib/step_q_w_n": 1378.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2584.0,
"completions/max_terminated_length": 2584.0,
"completions/mean_length": 776.68359375,
"completions/mean_terminated_length": 831.9288330078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 412.0,
"epoch": 0.1056,
"grad_norm": 0.01690530963242054,
"kl": 0.0462646484375,
"learning_rate": 2.805555555555556e-06,
"loss": -0.2449,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.016956226900219917,
"mask/share_reasoning": 0.8120033740997314,
"mask/share_step_conf": 0.10463409125804901,
"num_tokens": 29836140.0,
"reward": 0.5938348770141602,
"reward_std": 0.29406648874282837,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.4537390470504761,
"rewards/format_reward_step": 0.921875,
"rewards/step_l1_reward": 0.46127450466156006,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.5383015871047974,
"adv/mean_abs_reasoning": 0.42496341466903687,
"adv/mean_abs_step_conf": 0.7425418496131897,
"adv/ratio_final_to_reasoning": 1.2667010112482946,
"adv/ratio_step_to_reasoning": 1.7473077069269225,
"adv/std_final_conf": 0.7861893177032471,
"adv/std_reasoning": 0.7206922769546509,
"adv/std_step_conf": 0.9341355562210083,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.5274305555555555,
"calib/avg_num_step_conf": 9.19921875,
"calib/ece": 0.31941908713692957,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.995850622406639,
"calib/gap": 0.013182098765432304,
"calib/mean_conf": 0.9833195020746889,
"calib/mu_c": 0.9877500000000001,
"calib/mu_w": 0.9745679012345678,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31941908713692957,
"calib/std_conf": 0.06407154968151031,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9168573551263003,
"calib/step_q_c_n": 1346.0,
"calib/step_q_gap": 0.0012478407556363313,
"calib/step_q_w": 0.915609514370664,
"calib/step_q_w_n": 1009.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2059.0,
"completions/max_terminated_length": 2059.0,
"completions/mean_length": 759.0546875,
"completions/mean_terminated_length": 796.3851928710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 447.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.0173397958278656,
"kl": 0.05344390869140625,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.142,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018021564930677414,
"mask/share_reasoning": 0.8257356882095337,
"mask/share_step_conf": 0.10936775803565979,
"num_tokens": 30137866.0,
"reward": 0.7770144939422607,
"reward_std": 0.22662517428398132,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6368894577026367,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.6046394109725952,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.611314058303833,
"adv/mean_abs_reasoning": 0.5150396227836609,
"adv/mean_abs_step_conf": 0.7677444219589233,
"adv/ratio_final_to_reasoning": 1.1869262698660596,
"adv/ratio_step_to_reasoning": 1.490651180989641,
"adv/std_final_conf": 0.8419346213340759,
"adv/std_reasoning": 0.7928768992424011,
"adv/std_step_conf": 0.9352395534515381,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5257377049180328,
"calib/avg_num_step_conf": 8.94921875,
"calib/ece": 0.4697975708502024,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9838056680161943,
"calib/gap": 0.024233442622950796,
"calib/mean_conf": 0.9758704453441294,
"calib/mu_c": 0.9878399999999999,
"calib/mu_w": 0.9636065573770491,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4697975708502024,
"calib/std_conf": 0.1086163969073252,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.913648393194707,
"calib/step_q_c_n": 1058.0,
"calib/step_q_gap": 0.016827630826499296,
"calib/step_q_w": 0.8968207623682077,
"calib/step_q_w_n": 1233.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2176.0,
"completions/max_terminated_length": 2176.0,
"completions/mean_length": 817.125,
"completions/mean_terminated_length": 846.8988037109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 439.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.018264202401041985,
"kl": 0.05799102783203125,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.109,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.017344597727060318,
"mask/share_reasoning": 0.8403701782226562,
"mask/share_step_conf": 0.10712893307209015,
"num_tokens": 30454042.0,
"reward": 0.6558018922805786,
"reward_std": 0.2865288555622101,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5110242366790771,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.5099546909332275,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.5574395656585693,
"adv/mean_abs_reasoning": 0.496779203414917,
"adv/mean_abs_step_conf": 0.7435898184776306,
"adv/ratio_final_to_reasoning": 1.1221072899724185,
"adv/ratio_step_to_reasoning": 1.4968215524444446,
"adv/std_final_conf": 0.7952538132667542,
"adv/std_reasoning": 0.7576413154602051,
"adv/std_step_conf": 0.9338082075119019,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.4978512267541803,
"calib/avg_num_step_conf": 9.4609375,
"calib/ece": 0.31547717842323664,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0002555086732297651,
"calib/mean_conf": 0.987676348547718,
"calib/mu_c": 0.9875925925925927,
"calib/mu_w": 0.9878481012658225,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31547717842323664,
"calib/std_conf": 0.007594154551192266,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9166166439290586,
"calib/step_q_c_n": 1466.0,
"calib/step_q_gap": -0.004293397911945607,
"calib/step_q_w": 0.9209100418410042,
"calib/step_q_w_n": 956.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2329.0,
"completions/max_terminated_length": 2329.0,
"completions/mean_length": 729.296875,
"completions/mean_terminated_length": 755.8704833984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 349.0,
"epoch": 0.1088,
"grad_norm": 0.023663917556405067,
"kl": 0.060455322265625,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.1672,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018974870443344116,
"mask/share_reasoning": 0.8205289840698242,
"mask/share_step_conf": 0.12533991038799286,
"num_tokens": 30747438.0,
"reward": 0.7782399654388428,
"reward_std": 0.23757195472717285,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6362093687057495,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.606208086013794,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.470939040184021,
"adv/mean_abs_reasoning": 0.43253329396247864,
"adv/mean_abs_step_conf": 0.7381531000137329,
"adv/ratio_final_to_reasoning": 1.0887925779532568,
"adv/ratio_step_to_reasoning": 1.7065809969249817,
"adv/std_final_conf": 0.774346113204956,
"adv/std_reasoning": 0.73920077085495,
"adv/std_step_conf": 0.9338717460632324,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5595996038483305,
"calib/avg_num_step_conf": 9.515625,
"calib/ece": 0.3599591836734693,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9877551020408163,
"calib/gap": 0.023989813242784708,
"calib/mean_conf": 0.9803673469387755,
"calib/mu_c": 0.9894736842105265,
"calib/mu_w": 0.9654838709677418,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3599591836734693,
"calib/std_conf": 0.08927067161911918,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9253194959229057,
"calib/step_q_c_n": 1349.0,
"calib/step_q_gap": 0.011740839069179887,
"calib/step_q_w": 0.9135786568537259,
"calib/step_q_w_n": 1087.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2605.0,
"completions/max_terminated_length": 2605.0,
"completions/mean_length": 816.39453125,
"completions/mean_terminated_length": 839.3453369140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 343.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.024849504232406616,
"kl": 0.05157470703125,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0615,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.017709825187921524,
"mask/share_reasoning": 0.8464046120643616,
"mask/share_step_conf": 0.1085418090224266,
"num_tokens": 31060987.0,
"reward": 0.7577913403511047,
"reward_std": 0.19486621022224426,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6108323335647583,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.5938127040863037,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.5395331382751465,
"adv/mean_abs_reasoning": 0.4769881069660187,
"adv/mean_abs_step_conf": 0.7343798875808716,
"adv/ratio_final_to_reasoning": 1.131124928264896,
"adv/ratio_step_to_reasoning": 1.5396188644032374,
"adv/std_final_conf": 0.7932192087173462,
"adv/std_reasoning": 0.7576055526733398,
"adv/std_step_conf": 0.9342693090438843,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5226561980171668,
"calib/avg_num_step_conf": 9.2890625,
"calib/ece": 0.448170731707317,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0009295362299551746,
"calib/mean_conf": 0.9888211382113821,
"calib/mu_c": 0.9892481203007518,
"calib/mu_w": 0.9883185840707966,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.448170731707317,
"calib/std_conf": 0.004921386083299872,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9211541701769167,
"calib/step_q_c_n": 1187.0,
"calib/step_q_gap": 0.007031584114783995,
"calib/step_q_w": 0.9141225860621327,
"calib/step_q_w_n": 1191.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2850.0,
"completions/max_terminated_length": 2850.0,
"completions/mean_length": 752.5,
"completions/mean_terminated_length": 783.0894165039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.014635776169598103,
"kl": 0.0593109130859375,
"learning_rate": 2.666666666666667e-06,
"loss": -0.192,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.018321998417377472,
"mask/share_reasoning": 0.8238799571990967,
"mask/share_step_conf": 0.11873549222946167,
"num_tokens": 31360307.0,
"reward": 0.6815319061279297,
"reward_std": 0.2409968078136444,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5297003984451294,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5372697114944458,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.5429937839508057,
"adv/mean_abs_reasoning": 0.522207498550415,
"adv/mean_abs_step_conf": 0.7398031949996948,
"adv/ratio_final_to_reasoning": 1.0398046474975768,
"adv/ratio_step_to_reasoning": 1.4166843583313131,
"adv/std_final_conf": 0.7971596717834473,
"adv/std_reasoning": 0.7929220795631409,
"adv/std_step_conf": 0.9352218508720398,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5242816091954023,
"calib/avg_num_step_conf": 9.54296875,
"calib/ece": 0.3780497925311204,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.983402489626556,
"calib/gap": 0.02392887931034471,
"calib/mean_conf": 0.9797095435684647,
"calib/mu_c": 0.9892413793103448,
"calib/mu_w": 0.9653125000000001,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3780497925311204,
"calib/std_conf": 0.09072505699646372,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.919859375,
"calib/step_q_c_n": 1280.0,
"calib/step_q_gap": 0.013978033641444565,
"calib/step_q_w": 0.9058813413585555,
"calib/step_q_w_n": 1163.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2936.0,
"completions/max_terminated_length": 2936.0,
"completions/mean_length": 792.6015625,
"completions/mean_terminated_length": 828.187744140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 409.0,
"epoch": 0.112,
"grad_norm": 0.013355753384530544,
"kl": 0.0567626953125,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.1744,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.017631947994232178,
"mask/share_reasoning": 0.8259000778198242,
"mask/share_step_conf": 0.11349925398826599,
"num_tokens": 31668973.0,
"reward": 0.7236430644989014,
"reward_std": 0.2772770822048187,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.5842854976654053,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.5614380836486816,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.5538749098777771,
"adv/mean_abs_reasoning": 0.4785403609275818,
"adv/mean_abs_step_conf": 0.7471832036972046,
"adv/ratio_final_to_reasoning": 1.1574256950953314,
"adv/ratio_step_to_reasoning": 1.5613796968951525,
"adv/std_final_conf": 0.7899455428123474,
"adv/std_reasoning": 0.7393032312393188,
"adv/std_step_conf": 0.934104323387146,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5122705932565087,
"calib/avg_num_step_conf": 9.19140625,
"calib/ece": 0.3994190871369294,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.995850622406639,
"calib/gap": 0.0009332764262341975,
"calib/mean_conf": 0.9886307053941908,
"calib/mu_c": 0.9890140845070422,
"calib/mu_w": 0.988080808080808,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3994190871369294,
"calib/std_conf": 0.007301489866746601,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9207292539815591,
"calib/step_q_c_n": 1193.0,
"calib/step_q_gap": -0.0048052287770616875,
"calib/step_q_w": 0.9255344827586208,
"calib/step_q_w_n": 1160.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2674.0,
"completions/max_terminated_length": 2674.0,
"completions/mean_length": 717.5546875,
"completions/mean_terminated_length": 759.0661010742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 436.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.019834445789456367,
"kl": 0.06396484375,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.1487,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018569810315966606,
"mask/share_reasoning": 0.8146433234214783,
"mask/share_step_conf": 0.11209937185049057,
"num_tokens": 31957251.0,
"reward": 0.7071048021316528,
"reward_std": 0.21697908639907837,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5637343525886536,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.5512564182281494,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.5383345484733582,
"adv/mean_abs_reasoning": 0.47710686922073364,
"adv/mean_abs_step_conf": 0.7578780651092529,
"adv/ratio_final_to_reasoning": 1.1283311626861057,
"adv/ratio_step_to_reasoning": 1.5884870120338184,
"adv/std_final_conf": 0.7956221699714661,
"adv/std_reasoning": 0.7576800584793091,
"adv/std_step_conf": 0.9345592260360718,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.518074912891986,
"calib/avg_num_step_conf": 9.8984375,
"calib/ece": 0.3277822580645162,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0012979094076654984,
"calib/mean_conf": 0.9890725806451613,
"calib/mu_c": 0.9895121951219511,
"calib/mu_w": 0.9882142857142856,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3277822580645162,
"calib/std_conf": 0.005034835723201447,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.91822266934943,
"calib/step_q_c_n": 1491.0,
"calib/step_q_gap": -0.004576947141461729,
"calib/step_q_w": 0.9227996164908917,
"calib/step_q_w_n": 1043.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 1584.0,
"completions/max_terminated_length": 1584.0,
"completions/mean_length": 741.94140625,
"completions/mean_terminated_length": 756.7211303710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 360.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.014583291485905647,
"kl": 0.06496429443359375,
"learning_rate": 2.5833333333333337e-06,
"loss": -0.1273,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.01896454207599163,
"mask/share_reasoning": 0.8346059322357178,
"mask/share_step_conf": 0.12689821422100067,
"num_tokens": 32251804.0,
"reward": 0.7869763970375061,
"reward_std": 0.2739853858947754,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6443132162094116,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.60854572057724,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.5826718807220459,
"adv/mean_abs_reasoning": 0.5499783754348755,
"adv/mean_abs_step_conf": 0.7166726589202881,
"adv/ratio_final_to_reasoning": 1.0594450741109942,
"adv/ratio_step_to_reasoning": 1.3030924322317312,
"adv/std_final_conf": 0.8173876404762268,
"adv/std_reasoning": 0.8100540041923523,
"adv/std_step_conf": 0.9353122711181641,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.5079188663517643,
"calib/avg_num_step_conf": 9.76171875,
"calib/ece": 0.24537815126050422,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0007409465592294051,
"calib/mean_conf": 0.9890756302521009,
"calib/mu_c": 0.989265536723164,
"calib/mu_w": 0.9885245901639346,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24537815126050422,
"calib/std_conf": 0.004581951556374991,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9210409670920081,
"calib/step_q_c_n": 1489.0,
"calib/step_q_gap": -0.00631546855155618,
"calib/step_q_w": 0.9273564356435643,
"calib/step_q_w_n": 1010.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2806.0,
"completions/max_terminated_length": 2806.0,
"completions/mean_length": 764.2421875,
"completions/mean_terminated_length": 801.8278198242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 405.0,
"epoch": 0.1152,
"grad_norm": 0.013571220450103283,
"kl": 0.0575714111328125,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.1597,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.017903491854667664,
"mask/share_reasoning": 0.8209375143051147,
"mask/share_step_conf": 0.1142839789390564,
"num_tokens": 32550682.0,
"reward": 0.8342296481132507,
"reward_std": 0.30271947383880615,
"rewards/accuracy_reward_step": 0.69140625,
"rewards/final_brier_reward_step": 0.6967445015907288,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.6474959850311279,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.44212257862091064,
"adv/mean_abs_reasoning": 0.3431011438369751,
"adv/mean_abs_step_conf": 0.7551099061965942,
"adv/ratio_final_to_reasoning": 1.2886071252242333,
"adv/ratio_step_to_reasoning": 2.2008376240079968,
"adv/std_final_conf": 0.7278553247451782,
"adv/std_reasoning": 0.6403741240501404,
"adv/std_step_conf": 0.933077871799469,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5175289312457454,
"calib/avg_num_step_conf": 9.77734375,
"calib/ece": 0.4522633744855968,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9958847736625515,
"calib/gap": 0.004605854322668468,
"calib/mean_conf": 0.9872427983539095,
"calib/mu_c": 0.9893846153846154,
"calib/mu_w": 0.984778761061947,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4522633744855968,
"calib/std_conf": 0.02564819560608947,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9250763052208836,
"calib/step_q_c_n": 1245.0,
"calib/step_q_gap": 0.018216209831376506,
"calib/step_q_w": 0.9068600953895071,
"calib/step_q_w_n": 1258.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 789.09765625,
"completions/mean_terminated_length": 817.8502197265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 458.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.019535109400749207,
"kl": 0.06496429443359375,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.12,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01754792034626007,
"mask/share_reasoning": 0.8291429281234741,
"mask/share_step_conf": 0.1181529089808464,
"num_tokens": 32857291.0,
"reward": 0.6686311960220337,
"reward_std": 0.1786944568157196,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5204710960388184,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.5253850221633911,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.5592010021209717,
"adv/mean_abs_reasoning": 0.5262867212295532,
"adv/mean_abs_step_conf": 0.7592321634292603,
"adv/ratio_final_to_reasoning": 1.0625405877893357,
"adv/ratio_step_to_reasoning": 1.4426207859766653,
"adv/std_final_conf": 0.8077802062034607,
"adv/std_reasoning": 0.7928251028060913,
"adv/std_step_conf": 0.9346528053283691,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5193466648404328,
"calib/avg_num_step_conf": 9.7578125,
"calib/ece": 0.3857085020242914,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.001299821942199686,
"calib/mean_conf": 0.9889473684210526,
"calib/mu_c": 0.9894630872483222,
"calib/mu_w": 0.9881632653061225,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3857085020242914,
"calib/std_conf": 0.0056648342716980205,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.92643906020558,
"calib/step_q_c_n": 1362.0,
"calib/step_q_gap": 0.0037453982337489755,
"calib/step_q_w": 0.922693661971831,
"calib/step_q_w_n": 1136.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2880.0,
"completions/max_terminated_length": 2880.0,
"completions/mean_length": 720.6015625,
"completions/mean_terminated_length": 737.8960571289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.016404099762439728,
"kl": 0.06627655029296875,
"learning_rate": 2.5e-06,
"loss": -0.1404,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019600635394454002,
"mask/share_reasoning": 0.8245271444320679,
"mask/share_step_conf": 0.1324346661567688,
"num_tokens": 33146685.0,
"reward": 0.7375810146331787,
"reward_std": 0.2660372257232666,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.5909448862075806,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.5748422145843506,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.534350574016571,
"adv/mean_abs_reasoning": 0.46892502903938293,
"adv/mean_abs_step_conf": 0.7618857622146606,
"adv/ratio_final_to_reasoning": 1.1395223989455536,
"adv/ratio_step_to_reasoning": 1.6247496188792117,
"adv/std_final_conf": 0.783694863319397,
"adv/std_reasoning": 0.7576603293418884,
"adv/std_step_conf": 0.9344452619552612,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5555994824611845,
"calib/avg_num_step_conf": 9.7890625,
"calib/ece": 0.3735674931129478,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.987603305785124,
"calib/gap": 0.010952175579835521,
"calib/mean_conf": 0.985137741046832,
"calib/mu_c": 0.9893918918918919,
"calib/mu_w": 0.9784397163120564,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3735674931129478,
"calib/std_conf": 0.04316458283197865,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9178399378399378,
"calib/step_q_c_n": 1287.0,
"calib/step_q_gap": -0.0005193730706446464,
"calib/step_q_w": 0.9183593109105824,
"calib/step_q_w_n": 1219.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2763.0,
"completions/max_terminated_length": 2763.0,
"completions/mean_length": 753.44140625,
"completions/mean_terminated_length": 793.7489624023438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 398.0,
"epoch": 0.1184,
"grad_norm": 0.016747089102864265,
"kl": 0.06616973876953125,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.2023,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.018317745998501778,
"mask/share_reasoning": 0.8140534162521362,
"mask/share_step_conf": 0.11684764921665192,
"num_tokens": 33446974.0,
"reward": 0.7371137142181396,
"reward_std": 0.25048425793647766,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.591988205909729,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.5775517225265503,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.5227197408676147,
"adv/mean_abs_reasoning": 0.4496156573295593,
"adv/mean_abs_step_conf": 0.7582208514213562,
"adv/ratio_final_to_reasoning": 1.1625923882905875,
"adv/ratio_step_to_reasoning": 1.686375550008027,
"adv/std_final_conf": 0.7749086022377014,
"adv/std_reasoning": 0.7393931746482849,
"adv/std_step_conf": 0.9344673752784729,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.5401295409743734,
"calib/avg_num_step_conf": 9.83203125,
"calib/ece": 0.42933333333333346,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.9958333333333333,
"calib/gap": 0.002410588566600813,
"calib/mean_conf": 0.9876666666666668,
"calib/mu_c": 0.9887313432835819,
"calib/mu_w": 0.9863207547169811,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.42933333333333346,
"calib/std_conf": 0.009285592184789415,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9196293176074136,
"calib/step_q_c_n": 1187.0,
"calib/step_q_gap": 0.012482701066060198,
"calib/step_q_w": 0.9071466165413534,
"calib/step_q_w_n": 1330.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2106.0,
"completions/max_terminated_length": 2106.0,
"completions/mean_length": 775.73828125,
"completions/mean_terminated_length": 820.6156616210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 472.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.02123652584850788,
"kl": 0.06374359130859375,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.2496,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.017327800393104553,
"mask/share_reasoning": 0.8179709911346436,
"mask/share_step_conf": 0.11001374572515488,
"num_tokens": 33753483.0,
"reward": 0.678582489490509,
"reward_std": 0.20910188555717468,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5304815769195557,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.536058247089386,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.6130001544952393,
"adv/mean_abs_reasoning": 0.5337440967559814,
"adv/mean_abs_step_conf": 0.7346939444541931,
"adv/ratio_final_to_reasoning": 1.1484907434498377,
"adv/ratio_step_to_reasoning": 1.3764909980635953,
"adv/std_final_conf": 0.8501105308532715,
"adv/std_reasoning": 0.8098631501197815,
"adv/std_step_conf": 0.934903621673584,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.513346747149564,
"calib/avg_num_step_conf": 10.6875,
"calib/ece": 0.41344129554655873,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0005915492957746515,
"calib/mean_conf": 0.9883400809716599,
"calib/mu_c": 0.9885915492957746,
"calib/mu_w": 0.988,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.41344129554655873,
"calib/std_conf": 0.00742559126679215,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9231344410876133,
"calib/step_q_c_n": 1324.0,
"calib/step_q_gap": -0.005417258629100585,
"calib/step_q_w": 0.9285516997167139,
"calib/step_q_w_n": 1412.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2053.0,
"completions/max_terminated_length": 2053.0,
"completions/mean_length": 708.453125,
"completions/mean_terminated_length": 734.2672119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 365.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.016310779377818108,
"kl": 0.07384490966796875,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.1681,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.019763082265853882,
"mask/share_reasoning": 0.813399076461792,
"mask/share_step_conf": 0.1316816210746765,
"num_tokens": 34040047.0,
"reward": 0.7140007019042969,
"reward_std": 0.2736779451370239,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5643468499183655,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.5597482919692993,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.5179460048675537,
"adv/mean_abs_reasoning": 0.40910542011260986,
"adv/mean_abs_step_conf": 0.7549639940261841,
"adv/ratio_final_to_reasoning": 1.2660453257377635,
"adv/ratio_step_to_reasoning": 1.8454020819826187,
"adv/std_final_conf": 0.7732772827148438,
"adv/std_reasoning": 0.6816585659980774,
"adv/std_step_conf": 0.933284342288971,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5517870439314967,
"calib/avg_num_step_conf": 10.19921875,
"calib/ece": 0.33880658436214,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9917695473251029,
"calib/gap": 0.0025532390171258967,
"calib/mean_conf": 0.9877777777777779,
"calib/mu_c": 0.9886708860759492,
"calib/mu_w": 0.9861176470588233,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33818930041152273,
"calib/std_conf": 0.012406025351440173,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9254854368932038,
"calib/step_q_c_n": 1442.0,
"calib/step_q_gap": 0.012525642196882236,
"calib/step_q_w": 0.9129597946963216,
"calib/step_q_w_n": 1169.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2101.0,
"completions/max_terminated_length": 2101.0,
"completions/mean_length": 712.6328125,
"completions/mean_terminated_length": 738.5991821289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 378.0,
"epoch": 0.1216,
"grad_norm": 0.023121921345591545,
"kl": 0.0703887939453125,
"learning_rate": 2.388888888888889e-06,
"loss": -0.1215,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.019264452159404755,
"mask/share_reasoning": 0.817579448223114,
"mask/share_step_conf": 0.12799984216690063,
"num_tokens": 34327505.0,
"reward": 0.7811685800552368,
"reward_std": 0.19103579223155975,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6261183023452759,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6221563220024109,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.5585366487503052,
"adv/mean_abs_reasoning": 0.49713629484176636,
"adv/mean_abs_step_conf": 0.7559834718704224,
"adv/ratio_final_to_reasoning": 1.1235080893220277,
"adv/ratio_step_to_reasoning": 1.520676481911353,
"adv/std_final_conf": 0.8085088729858398,
"adv/std_reasoning": 0.7576323747634888,
"adv/std_step_conf": 0.934259831905365,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.4924181427804616,
"calib/avg_num_step_conf": 9.1875,
"calib/ece": 0.43219512195121956,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.991869918699187,
"calib/gap": -0.008107890499194692,
"calib/mean_conf": 0.9834146341463416,
"calib/mu_c": 0.9798550724637681,
"calib/mu_w": 0.9879629629629628,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.42731707317073175,
"calib/std_conf": 0.06447825962837048,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9207207953603976,
"calib/step_q_c_n": 1207.0,
"calib/step_q_gap": 0.01630158138659854,
"calib/step_q_w": 0.9044192139737991,
"calib/step_q_w_n": 1145.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 769.578125,
"completions/mean_terminated_length": 794.4031982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 408.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.019223693758249283,
"kl": 0.0823211669921875,
"learning_rate": 2.361111111111111e-06,
"loss": -0.1441,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.0183610450476408,
"mask/share_reasoning": 0.829988420009613,
"mask/share_step_conf": 0.12040051817893982,
"num_tokens": 34629781.0,
"reward": 0.689035952091217,
"reward_std": 0.2614095211029053,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5410534739494324,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.537799596786499,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.6313544511795044,
"adv/mean_abs_reasoning": 0.6060963869094849,
"adv/mean_abs_step_conf": 0.7450257539749146,
"adv/ratio_final_to_reasoning": 1.0416733457178513,
"adv/ratio_step_to_reasoning": 1.2292199228803151,
"adv/std_final_conf": 0.859879195690155,
"adv/std_reasoning": 0.8429909348487854,
"adv/std_step_conf": 0.9344965219497681,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5225281602002503,
"calib/avg_num_step_conf": 10.37890625,
"calib/ece": 0.40823045267489727,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9917695473251029,
"calib/gap": 0.003289528577388734,
"calib/mean_conf": 0.9884773662551442,
"calib/mu_c": 0.9898581560283688,
"calib/mu_w": 0.9865686274509801,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40823045267489727,
"calib/std_conf": 0.013719204273677617,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9222453703703704,
"calib/step_q_c_n": 1296.0,
"calib/step_q_gap": 0.006110175660598016,
"calib/step_q_w": 0.9161351947097723,
"calib/step_q_w_n": 1361.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2961.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 792.05078125,
"completions/mean_terminated_length": 824.2479248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.01399253774434328,
"kl": 0.0571746826171875,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.116,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.017849216237664223,
"mask/share_reasoning": 0.821422278881073,
"mask/share_step_conf": 0.12166602164506912,
"num_tokens": 34937066.0,
"reward": 0.7141497135162354,
"reward_std": 0.3195981979370117,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5611796379089355,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.5671197175979614,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.64402174949646,
"adv/mean_abs_reasoning": 0.5441372394561768,
"adv/mean_abs_step_conf": 0.7752643823623657,
"adv/ratio_final_to_reasoning": 1.183564922224603,
"adv/ratio_step_to_reasoning": 1.4247589140143813,
"adv/std_final_conf": 0.8416389226913452,
"adv/std_reasoning": 0.7930080890655518,
"adv/std_step_conf": 0.9348081946372986,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5257218718884833,
"calib/avg_num_step_conf": 9.71484375,
"calib/ece": 0.44333333333333347,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.9878048780487805,
"calib/gap": 0.02748489877198812,
"calib/mean_conf": 0.9758536585365853,
"calib/mu_c": 0.9887022900763359,
"calib/mu_w": 0.9612173913043478,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.44333333333333347,
"calib/std_conf": 0.1087731048571918,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9253018372703411,
"calib/step_q_c_n": 1143.0,
"calib/step_q_gap": 0.02639558727034097,
"calib/step_q_w": 0.8989062500000001,
"calib/step_q_w_n": 1344.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2672.0,
"completions/max_terminated_length": 2672.0,
"completions/mean_length": 747.90234375,
"completions/mean_terminated_length": 772.0281982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.1248,
"grad_norm": 0.015917358919978142,
"kl": 0.0683746337890625,
"learning_rate": 2.305555555555556e-06,
"loss": -0.088,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.018997523933649063,
"mask/share_reasoning": 0.8268004655838013,
"mask/share_step_conf": 0.12295202910900116,
"num_tokens": 35235129.0,
"reward": 0.6883047819137573,
"reward_std": 0.29987388849258423,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5306491851806641,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.5529916286468506,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.538367509841919,
"adv/mean_abs_reasoning": 0.4627258777618408,
"adv/mean_abs_step_conf": 0.7386770248413086,
"adv/ratio_final_to_reasoning": 1.1634696387544807,
"adv/ratio_step_to_reasoning": 1.5963598759901134,
"adv/std_final_conf": 0.8059641718864441,
"adv/std_reasoning": 0.7574905753135681,
"adv/std_step_conf": 0.9340482950210571,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5306023763470572,
"calib/avg_num_step_conf": 9.60546875,
"calib/ece": 0.3664112903225807,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9959677419354839,
"calib/gap": 0.002460624481900986,
"calib/mean_conf": 0.9873790322580646,
"calib/mu_c": 0.9883116883116883,
"calib/mu_w": 0.9858510638297873,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3664112903225807,
"calib/std_conf": 0.009587536215127371,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9131305536568695,
"calib/step_q_c_n": 1463.0,
"calib/step_q_gap": 0.008920379627418318,
"calib/step_q_w": 0.9042101740294511,
"calib/step_q_w_n": 996.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 1822.0,
"completions/max_terminated_length": 1822.0,
"completions/mean_length": 757.5,
"completions/mean_terminated_length": 775.6800537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 424.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.015437428839504719,
"kl": 0.06353759765625,
"learning_rate": 2.277777777777778e-06,
"loss": -0.1185,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018956642597913742,
"mask/share_reasoning": 0.831961989402771,
"mask/share_step_conf": 0.12564381957054138,
"num_tokens": 35533057.0,
"reward": 0.7535146474838257,
"reward_std": 0.23272234201431274,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6116320490837097,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.58133465051651,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.538609504699707,
"adv/mean_abs_reasoning": 0.47553685307502747,
"adv/mean_abs_step_conf": 0.7550853490829468,
"adv/ratio_final_to_reasoning": 1.132634623829519,
"adv/ratio_step_to_reasoning": 1.5878587415470273,
"adv/std_final_conf": 0.7883803844451904,
"adv/std_reasoning": 0.7576671242713928,
"adv/std_step_conf": 0.9337403774261475,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.5490570112294846,
"calib/avg_num_step_conf": 8.828125,
"calib/ece": 0.3654732510288067,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.9917695473251029,
"calib/gap": 0.0033630866685860905,
"calib/mean_conf": 0.9868724279835391,
"calib/mu_c": 0.9881456953642382,
"calib/mu_w": 0.9847826086956522,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3654732510288067,
"calib/std_conf": 0.012107720433875898,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9222435897435898,
"calib/step_q_c_n": 1248.0,
"calib/step_q_gap": 0.002688253775210425,
"calib/step_q_w": 0.9195553359683794,
"calib/step_q_w_n": 1012.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 800.87890625,
"completions/mean_terminated_length": 830.0607299804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.02127382531762123,
"kl": 0.0583038330078125,
"learning_rate": 2.25e-06,
"loss": -0.1025,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.018054481595754623,
"mask/share_reasoning": 0.8387048244476318,
"mask/share_step_conf": 0.10808445513248444,
"num_tokens": 35843146.0,
"reward": 0.7453292608261108,
"reward_std": 0.24058867990970612,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6004007458686829,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.5832265615463257,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.5529657602310181,
"adv/mean_abs_reasoning": 0.5008734464645386,
"adv/mean_abs_step_conf": 0.7641829252243042,
"adv/ratio_final_to_reasoning": 1.104002945522822,
"adv/ratio_step_to_reasoning": 1.5257006148326686,
"adv/std_final_conf": 0.8203580975532532,
"adv/std_reasoning": 0.7754554748535156,
"adv/std_step_conf": 0.9351306557655334,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.551783659378596,
"calib/avg_num_step_conf": 10.9765625,
"calib/ece": 0.326137339055794,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.9828326180257511,
"calib/gap": 0.001138418543481734,
"calib/mean_conf": 0.9870815450643777,
"calib/mu_c": 0.9874675324675324,
"calib/mu_w": 0.9863291139240506,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.326137339055794,
"calib/std_conf": 0.012604149913328892,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9201223902087833,
"calib/step_q_c_n": 1389.0,
"calib/step_q_gap": -0.008723492972075308,
"calib/step_q_w": 0.9288458831808586,
"calib/step_q_w_n": 1421.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08203125,
"completions/max_length": 2599.0,
"completions/max_terminated_length": 2599.0,
"completions/mean_length": 715.015625,
"completions/mean_terminated_length": 778.9105834960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 421.0,
"epoch": 0.128,
"grad_norm": 0.01677713543176651,
"kl": 0.064300537109375,
"learning_rate": 2.222222222222222e-06,
"loss": -0.2494,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.017968494445085526,
"mask/share_reasoning": 0.7828407883644104,
"mask/share_step_conf": 0.11715947091579437,
"num_tokens": 36132878.0,
"reward": 0.7485262751579285,
"reward_std": 0.22463907301425934,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6096257567405701,
"rewards/format_reward_step": 0.90625,
"rewards/step_l1_reward": 0.585864245891571,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.6709598302841187,
"adv/mean_abs_reasoning": 0.5795350074768066,
"adv/mean_abs_step_conf": 0.7495955228805542,
"adv/ratio_final_to_reasoning": 1.1577554791821112,
"adv/ratio_step_to_reasoning": 1.293443041765779,
"adv/std_final_conf": 0.868781328201294,
"adv/std_reasoning": 0.8101203441619873,
"adv/std_step_conf": 0.9349991083145142,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5046420083184788,
"calib/avg_num_step_conf": 9.1796875,
"calib/ece": 0.35302904564315374,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.995850622406639,
"calib/gap": -0.0010063874034462739,
"calib/mean_conf": 0.9878838174273861,
"calib/mu_c": 0.9875163398692811,
"calib/mu_w": 0.9885227272727274,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.35302904564315374,
"calib/std_conf": 0.009022964451993423,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9181722054380665,
"calib/step_q_c_n": 1324.0,
"calib/step_q_gap": 0.012295012455610377,
"calib/step_q_w": 0.9058771929824562,
"calib/step_q_w_n": 1026.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2754.0,
"completions/max_terminated_length": 2754.0,
"completions/mean_length": 780.52734375,
"completions/mean_terminated_length": 818.9138793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 413.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.012640361674129963,
"kl": 0.059722900390625,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.2439,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.01739826798439026,
"mask/share_reasoning": 0.8256901502609253,
"mask/share_step_conf": 0.11003653705120087,
"num_tokens": 36437749.0,
"reward": 0.7456340789794922,
"reward_std": 0.30374008417129517,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6053320169448853,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.5781236886978149,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.5027682185173035,
"adv/mean_abs_reasoning": 0.44970449805259705,
"adv/mean_abs_step_conf": 0.7147163152694702,
"adv/ratio_final_to_reasoning": 1.1179968639284104,
"adv/ratio_step_to_reasoning": 1.5893021269844574,
"adv/std_final_conf": 0.7782015800476074,
"adv/std_reasoning": 0.7575188875198364,
"adv/std_step_conf": 0.9345012903213501,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.49660633484162897,
"calib/avg_num_step_conf": 9.2578125,
"calib/ece": 0.33995850622406654,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.9875518672199171,
"calib/gap": 0.0012217194570136591,
"calib/mean_conf": 0.9872614107883818,
"calib/mu_c": 0.9876923076923075,
"calib/mu_w": 0.9864705882352939,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.33995850622406654,
"calib/std_conf": 0.015351014823162748,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9221030042918454,
"calib/step_q_c_n": 1398.0,
"calib/step_q_gap": 0.014726461081968845,
"calib/step_q_w": 0.9073765432098766,
"calib/step_q_w_n": 972.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2789.0,
"completions/max_terminated_length": 2789.0,
"completions/mean_length": 767.0546875,
"completions/mean_terminated_length": 801.4938354492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.015725623816251755,
"kl": 0.06002044677734375,
"learning_rate": 2.166666666666667e-06,
"loss": -0.1633,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018181851133704185,
"mask/share_reasoning": 0.8258481025695801,
"mask/share_step_conf": 0.1130012795329094,
"num_tokens": 36741459.0,
"reward": 0.7598967552185059,
"reward_std": 0.19657093286514282,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6178292632102966,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l1_reward": 0.5933704376220703,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.624484658241272,
"adv/mean_abs_reasoning": 0.5451442003250122,
"adv/mean_abs_step_conf": 0.7624143362045288,
"adv/ratio_final_to_reasoning": 1.145540313680229,
"adv/ratio_step_to_reasoning": 1.3985553469155156,
"adv/std_final_conf": 0.8395637273788452,
"adv/std_reasoning": 0.7929816246032715,
"adv/std_step_conf": 0.934124767780304,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5196058374249864,
"calib/avg_num_step_conf": 9.15625,
"calib/ece": 0.40697959183673466,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9959183673469387,
"calib/gap": 0.009171440261865671,
"calib/mean_conf": 0.9824897959183673,
"calib/mu_c": 0.9863829787234042,
"calib/mu_w": 0.9772115384615385,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40697959183673466,
"calib/std_conf": 0.06380394794546601,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9132793522267207,
"calib/step_q_c_n": 1235.0,
"calib/step_q_gap": -0.001919024689419846,
"calib/step_q_w": 0.9151983769161406,
"calib/step_q_w_n": 1109.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2921.0,
"completions/max_terminated_length": 2921.0,
"completions/mean_length": 820.36328125,
"completions/mean_terminated_length": 846.8265991210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.1312,
"grad_norm": 0.05354432389140129,
"kl": 0.08370208740234375,
"learning_rate": 2.138888888888889e-06,
"loss": -0.1112,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.0171657707542181,
"mask/share_reasoning": 0.839978039264679,
"mask/share_step_conf": 0.11160621047019958,
"num_tokens": 37056760.0,
"reward": 0.7095881700515747,
"reward_std": 0.29550105333328247,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5651074051856995,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.5525063276290894,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.5212051868438721,
"adv/mean_abs_reasoning": 0.4511791467666626,
"adv/mean_abs_step_conf": 0.7597951889038086,
"adv/ratio_final_to_reasoning": 1.1552067301404447,
"adv/ratio_step_to_reasoning": 1.6840210686792971,
"adv/std_final_conf": 0.7759954929351807,
"adv/std_reasoning": 0.7207808494567871,
"adv/std_step_conf": 0.9350464344024658,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.5416800385109115,
"calib/avg_num_step_conf": 10.3046875,
"calib/ece": 0.31208333333333327,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.9791666666666666,
"calib/gap": -0.008183568677792019,
"calib/mean_conf": 0.9787500000000001,
"calib/mu_c": 0.9761585365853657,
"calib/mu_w": 0.9843421052631577,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3037499999999999,
"calib/std_conf": 0.09053141719867197,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9168852459016393,
"calib/step_q_c_n": 1464.0,
"calib/step_q_gap": 0.006482349819867506,
"calib/step_q_w": 0.9104028960817718,
"calib/step_q_w_n": 1174.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2941.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 804.55859375,
"completions/mean_terminated_length": 844.1270141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.03308064118027687,
"kl": 0.058441162109375,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.1693,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.017353367060422897,
"mask/share_reasoning": 0.8213303089141846,
"mask/share_step_conf": 0.11444129049777985,
"num_tokens": 37369543.0,
"reward": 0.7908234596252441,
"reward_std": 0.24148871004581451,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6418148279190063,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.6242070198059082,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.6522349119186401,
"adv/mean_abs_reasoning": 0.5776388645172119,
"adv/mean_abs_step_conf": 0.7451116442680359,
"adv/ratio_final_to_reasoning": 1.1291395921979301,
"adv/ratio_step_to_reasoning": 1.2899264402695567,
"adv/std_final_conf": 0.8551003932952881,
"adv/std_reasoning": 0.8099701404571533,
"adv/std_step_conf": 0.9350612759590149,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5327742837176799,
"calib/avg_num_step_conf": 9.19921875,
"calib/ece": 0.4260995850622408,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.9875518672199171,
"calib/gap": 0.0036065688329837142,
"calib/mean_conf": 0.9862655601659752,
"calib/mu_c": 0.9878518518518518,
"calib/mu_w": 0.984245283018868,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4260995850622408,
"calib/std_conf": 0.01645300161157873,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9197979797979798,
"calib/step_q_c_n": 1188.0,
"calib/step_q_gap": 0.0045280569188023145,
"calib/step_q_w": 0.9152699228791775,
"calib/step_q_w_n": 1167.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2966.0,
"completions/max_terminated_length": 2966.0,
"completions/mean_length": 856.3984375,
"completions/mean_terminated_length": 880.473876953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 326.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.024113211780786514,
"kl": 0.0589752197265625,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.053,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.017093347385525703,
"mask/share_reasoning": 0.844731330871582,
"mask/share_step_conf": 0.11083149909973145,
"num_tokens": 37693589.0,
"reward": 0.6838798522949219,
"reward_std": 0.27375972270965576,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5360523462295532,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.5387386083602905,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.519602358341217,
"adv/mean_abs_reasoning": 0.4709121584892273,
"adv/mean_abs_step_conf": 0.7496457695960999,
"adv/ratio_final_to_reasoning": 1.1033955037563627,
"adv/ratio_step_to_reasoning": 1.5919014960265652,
"adv/std_final_conf": 0.7761543989181519,
"adv/std_reasoning": 0.7577767968177795,
"adv/std_step_conf": 0.9334598183631897,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.531223083548665,
"calib/avg_num_step_conf": 9.9765625,
"calib/ece": 0.43189873417721514,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.9662447257383966,
"calib/gap": 0.021431955211025033,
"calib/mean_conf": 0.9762025316455696,
"calib/mu_c": 0.985968992248062,
"calib/mu_w": 0.964537037037037,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.43189873417721514,
"calib/std_conf": 0.09223393205018894,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9176877133105804,
"calib/step_q_c_n": 1172.0,
"calib/step_q_gap": 0.01070507944661514,
"calib/step_q_w": 0.9069826338639653,
"calib/step_q_w_n": 1382.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 3008.0,
"completions/max_terminated_length": 3008.0,
"completions/mean_length": 799.23046875,
"completions/mean_terminated_length": 838.536865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 463.0,
"epoch": 0.1344,
"grad_norm": 0.025906076654791832,
"kl": 0.06085205078125,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.1235,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.01753048598766327,
"mask/share_reasoning": 0.8175128102302551,
"mask/share_step_conf": 0.11808168888092041,
"num_tokens": 38003656.0,
"reward": 0.6730961799621582,
"reward_std": 0.24437865614891052,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5240218639373779,
"rewards/format_reward_step": 0.921875,
"rewards/step_l1_reward": 0.5370141267776489,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.5989837646484375,
"adv/mean_abs_reasoning": 0.4840828776359558,
"adv/mean_abs_step_conf": 0.7649731636047363,
"adv/ratio_final_to_reasoning": 1.2373578829592284,
"adv/ratio_step_to_reasoning": 1.5802524711068546,
"adv/std_final_conf": 0.8067528605461121,
"adv/std_reasoning": 0.739617645740509,
"adv/std_step_conf": 0.9335212707519531,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.5324467698568418,
"calib/avg_num_step_conf": 9.9765625,
"calib/ece": 0.39130252100840346,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.9831932773109243,
"calib/gap": 0.024702419882276172,
"calib/mean_conf": 0.9753361344537815,
"calib/mu_c": 0.9856115107913669,
"calib/mu_w": 0.9609090909090907,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39130252100840346,
"calib/std_conf": 0.09605939840553002,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9170251177394034,
"calib/step_q_c_n": 1274.0,
"calib/step_q_gap": 0.010358451072736807,
"calib/step_q_w": 0.9066666666666666,
"calib/step_q_w_n": 1280.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2848.0,
"completions/max_terminated_length": 2848.0,
"completions/mean_length": 732.9296875,
"completions/mean_terminated_length": 775.33056640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 403.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.02946789562702179,
"kl": 0.079742431640625,
"learning_rate": 2.027777777777778e-06,
"loss": -0.1981,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.01851074770092964,
"mask/share_reasoning": 0.8075978755950928,
"mask/share_step_conf": 0.1192038431763649,
"num_tokens": 38294958.0,
"reward": 0.7024646401405334,
"reward_std": 0.26939183473587036,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5640590190887451,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.5463390350341797,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.6941781044006348,
"adv/mean_abs_reasoning": 0.5947922468185425,
"adv/mean_abs_step_conf": 0.7895233631134033,
"adv/ratio_final_to_reasoning": 1.1670933979279199,
"adv/ratio_step_to_reasoning": 1.3273935014056577,
"adv/std_final_conf": 0.8686315417289734,
"adv/std_reasoning": 0.810073971748352,
"adv/std_step_conf": 0.9350660443305969,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.5875923413236847,
"calib/avg_num_step_conf": 10.046875,
"calib/ece": 0.39669527896995715,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.9613733905579399,
"calib/gap": 0.03229006482737817,
"calib/mean_conf": 0.9718025751072962,
"calib/mu_c": 0.9855223880597013,
"calib/mu_w": 0.9532323232323231,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.39669527896995715,
"calib/std_conf": 0.09675070656505114,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9161878009630819,
"calib/step_q_c_n": 1246.0,
"calib/step_q_gap": 0.009815251943474212,
"calib/step_q_w": 0.9063725490196077,
"calib/step_q_w_n": 1326.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2936.0,
"completions/max_terminated_length": 2936.0,
"completions/mean_length": 829.671875,
"completions/mean_terminated_length": 870.475341796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.02151694893836975,
"kl": 0.06743621826171875,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.1478,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.01694222167134285,
"mask/share_reasoning": 0.8211142420768738,
"mask/share_step_conf": 0.11506853997707367,
"num_tokens": 38614018.0,
"reward": 0.6881914138793945,
"reward_std": 0.3057379722595215,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5496238470077515,
"rewards/format_reward_step": 0.90625,
"rewards/step_l1_reward": 0.5408215522766113,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.5974356532096863,
"adv/mean_abs_reasoning": 0.42029517889022827,
"adv/mean_abs_step_conf": 0.7427824139595032,
"adv/ratio_final_to_reasoning": 1.4214668243095008,
"adv/ratio_step_to_reasoning": 1.7672874952332045,
"adv/std_final_conf": 0.8128259181976318,
"adv/std_reasoning": 0.7016498446464539,
"adv/std_step_conf": 0.9345721006393433,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.5926601488399241,
"calib/avg_num_step_conf": 11.4609375,
"calib/ece": 0.3499176954732511,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.9794238683127572,
"calib/gap": 0.009147818473660974,
"calib/mean_conf": 0.9836625514403293,
"calib/mu_c": 0.987012987012987,
"calib/mu_w": 0.977865168539326,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3499176954732511,
"calib/std_conf": 0.019461931736232917,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9127908496732027,
"calib/step_q_c_n": 1530.0,
"calib/step_q_gap": -0.018776101893748898,
"calib/step_q_w": 0.9315669515669516,
"calib/step_q_w_n": 1404.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 1959.0,
"completions/max_terminated_length": 1959.0,
"completions/mean_length": 739.125,
"completions/mean_terminated_length": 775.475341796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.1376,
"grad_norm": 0.021641433238983154,
"kl": 0.0746612548828125,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.2126,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.018588412553071976,
"mask/share_reasoning": 0.8041425943374634,
"mask/share_step_conf": 0.1303940713405609,
"num_tokens": 38905618.0,
"reward": 0.7644376754760742,
"reward_std": 0.22583582997322083,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6163402199745178,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6023787260055542,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.6023110151290894,
"adv/mean_abs_reasoning": 0.2895757555961609,
"adv/mean_abs_step_conf": 0.7641828060150146,
"adv/ratio_final_to_reasoning": 2.0799773582187093,
"adv/ratio_step_to_reasoning": 2.638973709804406,
"adv/std_final_conf": 0.825607419013977,
"adv/std_reasoning": 0.5727540850639343,
"adv/std_step_conf": 0.9316400289535522,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5526162790697675,
"calib/avg_num_step_conf": 10.53125,
"calib/ece": 0.34788617886178874,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.943089430894309,
"calib/gap": -0.01525726744186029,
"calib/mean_conf": 0.9641463414634147,
"calib/mu_c": 0.9588125,
"calib/mu_w": 0.9740697674418602,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3308130081300814,
"calib/std_conf": 0.12827768931519623,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9229144827586208,
"calib/step_q_c_n": 1450.0,
"calib/step_q_gap": 0.0019514008966624585,
"calib/step_q_w": 0.9209630818619583,
"calib/step_q_w_n": 1246.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2009.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 720.69921875,
"completions/mean_terminated_length": 746.9595336914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 394.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.03302701190114021,
"kl": 0.07592010498046875,
"learning_rate": 1.944444444444445e-06,
"loss": -0.1114,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.019100401550531387,
"mask/share_reasoning": 0.818557858467102,
"mask/share_step_conf": 0.12718549370765686,
"num_tokens": 39195405.0,
"reward": 0.7844294309616089,
"reward_std": 0.17043429613113403,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6253741979598999,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.6262969970703125,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.6709331274032593,
"adv/mean_abs_reasoning": 0.4085502624511719,
"adv/mean_abs_step_conf": 0.7450651526451111,
"adv/ratio_final_to_reasoning": 1.6422290941091886,
"adv/ratio_step_to_reasoning": 1.8236805140570873,
"adv/std_final_conf": 0.853084921836853,
"adv/std_reasoning": 0.7014339566230774,
"adv/std_step_conf": 0.9335004687309265,
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.5526536846124475,
"calib/avg_num_step_conf": 11.375,
"calib/ece": 0.5476724137931034,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.9439655172413793,
"calib/gap": 0.02887361588392534,
"calib/mean_conf": 0.9657758620689655,
"calib/mu_c": 0.9825773195876291,
"calib/mu_w": 0.9537037037037037,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.5476724137931034,
"calib/std_conf": 0.1129819245695047,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9186820809248556,
"calib/step_q_c_n": 865.0,
"calib/step_q_gap": 0.010453779345308223,
"calib/step_q_w": 0.9082283015795474,
"calib/step_q_w_n": 2047.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2206.0,
"completions/max_terminated_length": 2206.0,
"completions/mean_length": 726.26953125,
"completions/mean_terminated_length": 784.49365234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 441.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.03065774217247963,
"kl": 0.0743255615234375,
"learning_rate": 1.916666666666667e-06,
"loss": -0.2806,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.017435047775506973,
"mask/share_reasoning": 0.7895956635475159,
"mask/share_step_conf": 0.11875058710575104,
"num_tokens": 39487538.0,
"reward": 0.5388969779014587,
"reward_std": 0.1687847077846527,
"rewards/accuracy_reward_step": 0.37890625,
"rewards/final_brier_reward_step": 0.4151046872138977,
"rewards/format_reward_step": 0.90625,
"rewards/step_l1_reward": 0.40565797686576843,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.6716172695159912,
"adv/mean_abs_reasoning": 0.5227418541908264,
"adv/mean_abs_step_conf": 0.774185061454773,
"adv/ratio_final_to_reasoning": 1.2847971979508226,
"adv/ratio_step_to_reasoning": 1.4810083700934293,
"adv/std_final_conf": 0.8198915719985962,
"adv/std_reasoning": 0.7578502297401428,
"adv/std_step_conf": 0.9345174431800842,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.650655864197531,
"calib/avg_num_step_conf": 10.7421875,
"calib/ece": 0.30702066115702487,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.9380165289256198,
"calib/gap": 0.03149861111111096,
"calib/mean_conf": 0.9656983471074382,
"calib/mu_c": 0.976111111111111,
"calib/mu_w": 0.9446125000000001,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3016487603305786,
"calib/std_conf": 0.11223983653925576,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9218165304268847,
"calib/step_q_c_n": 1468.0,
"calib/step_q_gap": -0.000662408730681574,
"calib/step_q_w": 0.9224789391575663,
"calib/step_q_w_n": 1282.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2304.0,
"completions/max_terminated_length": 2304.0,
"completions/mean_length": 726.109375,
"completions/mean_terminated_length": 764.9547119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.1408,
"grad_norm": 0.033202171325683594,
"kl": 0.080047607421875,
"learning_rate": 1.888888888888889e-06,
"loss": -0.2414,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.018654868006706238,
"mask/share_reasoning": 0.8074687123298645,
"mask/share_step_conf": 0.12309515476226807,
"num_tokens": 39779014.0,
"reward": 0.7947405576705933,
"reward_std": 0.27415207028388977,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6544085741043091,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.6194474697113037,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7088272571563721,
"adv/mean_abs_reasoning": 0.5792692303657532,
"adv/mean_abs_step_conf": 0.7640947103500366,
"adv/ratio_final_to_reasoning": 1.2236577052587712,
"adv/ratio_step_to_reasoning": 1.3190666279090706,
"adv/std_final_conf": 0.8966022729873657,
"adv/std_reasoning": 0.8267735838890076,
"adv/std_step_conf": 0.935346245765686,
"calib/answer_extract_rate": 0.890625,
"calib/auroc": 0.5655533980582524,
"calib/avg_num_step_conf": 11.609375,
"calib/ece": 0.5164912280701756,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.9473684210526315,
"calib/gap": 0.02622135922330071,
"calib/mean_conf": 0.9682456140350878,
"calib/mu_c": 0.982621359223301,
"calib/mu_w": 0.9564000000000002,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.5164912280701756,
"calib/std_conf": 0.09939064884210455,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9174496644295302,
"calib/step_q_c_n": 894.0,
"calib/step_q_gap": 0.031120662183780956,
"calib/step_q_w": 0.8863290022457493,
"calib/step_q_w_n": 2078.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 2652.0,
"completions/max_terminated_length": 2652.0,
"completions/mean_length": 756.0390625,
"completions/mean_terminated_length": 841.5043334960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.03641675040125847,
"kl": 0.06818389892578125,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.2985,
"mask/has_final_conf_rate": 0.890625,
"mask/share_final_conf": 0.016083287075161934,
"mask/share_reasoning": 0.7779492139816284,
"mask/share_step_conf": 0.104404978454113,
"num_tokens": 40078904.0,
"reward": 0.5671447515487671,
"reward_std": 0.29637211561203003,
"rewards/accuracy_reward_step": 0.40234375,
"rewards/final_brier_reward_step": 0.43522578477859497,
"rewards/format_reward_step": 0.890625,
"rewards/step_l1_reward": 0.44047001004219055,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.6852869987487793,
"adv/mean_abs_reasoning": 0.5316742658615112,
"adv/mean_abs_step_conf": 0.7545932531356812,
"adv/ratio_final_to_reasoning": 1.2889226407043757,
"adv/ratio_step_to_reasoning": 1.419277368847179,
"adv/std_final_conf": 0.8771677017211914,
"adv/std_reasoning": 0.7929584980010986,
"adv/std_step_conf": 0.9351169466972351,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.635230179028133,
"calib/avg_num_step_conf": 10.265625,
"calib/ece": 0.38364583333333335,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.8666666666666667,
"calib/gap": 0.0455562659846549,
"calib/mean_conf": 0.9553125000000001,
"calib/mu_c": 0.9746739130434783,
"calib/mu_w": 0.9291176470588234,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3819791666666667,
"calib/std_conf": 0.11596967208606739,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9199074852817494,
"calib/step_q_c_n": 1189.0,
"calib/step_q_gap": 0.008364747269240635,
"calib/step_q_w": 0.9115427380125087,
"calib/step_q_w_n": 1439.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2314.0,
"completions/max_terminated_length": 2314.0,
"completions/mean_length": 821.1953125,
"completions/mean_terminated_length": 865.1275634765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 424.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.03140419349074364,
"kl": 0.06982421875,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.1888,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.016364730894565582,
"mask/share_reasoning": 0.8280090689659119,
"mask/share_step_conf": 0.10484500229358673,
"num_tokens": 40398082.0,
"reward": 0.7085082530975342,
"reward_std": 0.2661244571208954,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5810662508010864,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.5406375527381897,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.6562824249267578,
"adv/mean_abs_reasoning": 0.5146161317825317,
"adv/mean_abs_step_conf": 0.7555180788040161,
"adv/ratio_final_to_reasoning": 1.2752853717459676,
"adv/ratio_step_to_reasoning": 1.4681196957177503,
"adv/std_final_conf": 0.8674776554107666,
"adv/std_reasoning": 0.775693416595459,
"adv/std_step_conf": 0.9348829388618469,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.5915721844293272,
"calib/avg_num_step_conf": 11.0390625,
"calib/ece": 0.3475949367088608,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.869198312236287,
"calib/gap": 0.015238095238095273,
"calib/mean_conf": 0.9627848101265823,
"calib/mu_c": 0.9685714285714285,
"calib/mu_w": 0.9533333333333333,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.34506329113924056,
"calib/std_conf": 0.06449289449353177,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9222406908927772,
"calib/step_q_c_n": 1486.0,
"calib/step_q_gap": 0.02305412372859794,
"calib/step_q_w": 0.8991865671641792,
"calib/step_q_w_n": 1340.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2762.0,
"completions/max_terminated_length": 2762.0,
"completions/mean_length": 799.9375,
"completions/mean_terminated_length": 846.21484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.144,
"grad_norm": 0.03021004982292652,
"kl": 0.0778350830078125,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.2959,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.017042845487594604,
"mask/share_reasoning": 0.8079831004142761,
"mask/share_step_conf": 0.12028656899929047,
"num_tokens": 40708746.0,
"reward": 0.730875551700592,
"reward_std": 0.2815753221511841,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6018984317779541,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": 0.55907142162323,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.6929965019226074,
"adv/mean_abs_reasoning": 0.5509441494941711,
"adv/mean_abs_step_conf": 0.749183177947998,
"adv/ratio_final_to_reasoning": 1.2578343967512067,
"adv/ratio_step_to_reasoning": 1.3598169227059271,
"adv/std_final_conf": 0.8918748497962952,
"adv/std_reasoning": 0.8101029396057129,
"adv/std_step_conf": 0.934345006942749,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.632510885341074,
"calib/avg_num_step_conf": 12.73828125,
"calib/ece": 0.39983050847457635,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.8432203389830508,
"calib/gap": 0.05777648766328025,
"calib/mean_conf": 0.9422033898305086,
"calib/mu_c": 0.9681538461538463,
"calib/mu_w": 0.910377358490566,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3955932203389831,
"calib/std_conf": 0.15291038470692864,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9165577507598784,
"calib/step_q_c_n": 1316.0,
"calib/step_q_gap": 0.005629730194325666,
"calib/step_q_w": 0.9109280205655528,
"calib/step_q_w_n": 1945.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 2163.0,
"completions/max_terminated_length": 2163.0,
"completions/mean_length": 743.79296875,
"completions/mean_terminated_length": 800.0462646484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.025395670905709267,
"kl": 0.084625244140625,
"learning_rate": 1.777777777777778e-06,
"loss": -0.3088,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.01756712794303894,
"mask/share_reasoning": 0.7889890670776367,
"mask/share_step_conf": 0.12313126027584076,
"num_tokens": 41007645.0,
"reward": 0.6623439788818359,
"reward_std": 0.2670624256134033,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5570906400680542,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": 0.48244112730026245,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.6910654902458191,
"adv/mean_abs_reasoning": 0.492321252822876,
"adv/mean_abs_step_conf": 0.7506650686264038,
"adv/ratio_final_to_reasoning": 1.4036881127584513,
"adv/ratio_step_to_reasoning": 1.5247464218175302,
"adv/std_final_conf": 0.8978220820426941,
"adv/std_reasoning": 0.792906641960144,
"adv/std_step_conf": 0.9349708557128906,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.5634290294374817,
"calib/avg_num_step_conf": 10.28125,
"calib/ece": 0.3368708333333335,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.7708333333333334,
"calib/gap": 0.04376318857475936,
"calib/mean_conf": 0.9385375000000001,
"calib/mu_c": 0.955678082191781,
"calib/mu_w": 0.9119148936170216,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3335375000000002,
"calib/std_conf": 0.1254835258526659,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9210515765765765,
"calib/step_q_c_n": 1332.0,
"calib/step_q_gap": 0.038313115038115075,
"calib/step_q_w": 0.8827384615384615,
"calib/step_q_w_n": 1300.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 3046.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 769.59375,
"completions/mean_terminated_length": 804.1469116210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 462.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.0315549410879612,
"kl": 0.08020782470703125,
"learning_rate": 1.75e-06,
"loss": -0.1211,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.017642011865973473,
"mask/share_reasoning": 0.8182367086410522,
"mask/share_step_conf": 0.12115253508090973,
"num_tokens": 41311645.0,
"reward": 0.7388617992401123,
"reward_std": 0.27276355028152466,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6155734062194824,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.5621501207351685,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.7305846214294434,
"adv/mean_abs_reasoning": 0.46121594309806824,
"adv/mean_abs_step_conf": 0.7638969421386719,
"adv/ratio_final_to_reasoning": 1.5840402578496713,
"adv/ratio_step_to_reasoning": 1.6562674243380278,
"adv/std_final_conf": 0.8997833728790283,
"adv/std_reasoning": 0.7208998203277588,
"adv/std_step_conf": 0.9345141053199768,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.6519679895476074,
"calib/avg_num_step_conf": 10.9375,
"calib/ece": 0.2590212765957448,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.7574468085106383,
"calib/gap": 0.11035113506451089,
"calib/mean_conf": 0.9185957446808513,
"calib/mu_c": 0.9552229299363059,
"calib/mu_w": 0.844871794871795,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25476595744680863,
"calib/std_conf": 0.17728732054054358,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9116248303934872,
"calib/step_q_c_n": 1474.0,
"calib/step_q_gap": 0.0005124623693543162,
"calib/step_q_w": 0.9111123680241329,
"calib/step_q_w_n": 1326.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3016.0,
"completions/max_terminated_length": 3016.0,
"completions/mean_length": 795.16796875,
"completions/mean_terminated_length": 844.6597900390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.1472,
"grad_norm": 0.03231998533010483,
"kl": 0.08074188232421875,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.1688,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.01688062585890293,
"mask/share_reasoning": 0.8103748559951782,
"mask/share_step_conf": 0.11415077745914459,
"num_tokens": 41619544.0,
"reward": 0.7909604907035828,
"reward_std": 0.2596752345561981,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6728769540786743,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": 0.6027940511703491,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.7011387944221497,
"adv/mean_abs_reasoning": 0.37920230627059937,
"adv/mean_abs_step_conf": 0.7306464910507202,
"adv/ratio_final_to_reasoning": 1.84898346562749,
"adv/ratio_step_to_reasoning": 1.9267986480264963,
"adv/std_final_conf": 0.9124810099601746,
"adv/std_reasoning": 0.6817423105239868,
"adv/std_step_conf": 0.9337948560714722,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6916055663624121,
"calib/avg_num_step_conf": 11.04296875,
"calib/ece": 0.2850204081632655,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.726530612244898,
"calib/gap": 0.05144096962442002,
"calib/mean_conf": 0.9291020408163266,
"calib/mu_c": 0.946319018404908,
"calib/mu_w": 0.8948780487804879,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2744081632653063,
"calib/std_conf": 0.14888706100467758,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9204639175257732,
"calib/step_q_c_n": 1552.0,
"calib/step_q_gap": 0.0009188194865574761,
"calib/step_q_w": 0.9195450980392157,
"calib/step_q_w_n": 1275.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2544.0,
"completions/max_terminated_length": 2544.0,
"completions/mean_length": 732.1640625,
"completions/mean_terminated_length": 765.0366821289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.06230498477816582,
"kl": 0.0870208740234375,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.2228,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.018324337899684906,
"mask/share_reasoning": 0.8134405612945557,
"mask/share_step_conf": 0.12526635825634003,
"num_tokens": 41910074.0,
"reward": 0.8124518394470215,
"reward_std": 0.21989166736602783,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.678037166595459,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6281166076660156,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.744654655456543,
"adv/mean_abs_reasoning": 0.5463522672653198,
"adv/mean_abs_step_conf": 0.7565972805023193,
"adv/ratio_final_to_reasoning": 1.362957015230841,
"adv/ratio_step_to_reasoning": 1.384815852031415,
"adv/std_final_conf": 0.9136979579925537,
"adv/std_reasoning": 0.7755256295204163,
"adv/std_step_conf": 0.9340972304344177,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6650914634146343,
"calib/avg_num_step_conf": 10.73046875,
"calib/ece": 0.25704918032786894,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7213114754098361,
"calib/gap": 0.10781402439024401,
"calib/mean_conf": 0.9045901639344264,
"calib/mu_c": 0.9399390243902441,
"calib/mu_w": 0.8321250000000001,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24475409836065584,
"calib/std_conf": 0.2062850468623199,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9193794871794871,
"calib/step_q_c_n": 1560.0,
"calib/step_q_gap": 0.014998695267102957,
"calib/step_q_w": 0.9043807919123842,
"calib/step_q_w_n": 1187.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2580.0,
"completions/max_terminated_length": 2580.0,
"completions/mean_length": 767.859375,
"completions/mean_terminated_length": 802.3346557617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.043366990983486176,
"kl": 0.09185791015625,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.1546,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.017610831186175346,
"mask/share_reasoning": 0.817704439163208,
"mask/share_step_conf": 0.1217159777879715,
"num_tokens": 42211662.0,
"reward": 0.8117408752441406,
"reward_std": 0.25670474767684937,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6961570382118225,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6093559861183167,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.6669213771820068,
"adv/mean_abs_reasoning": 0.4145224988460541,
"adv/mean_abs_step_conf": 0.7681502103805542,
"adv/ratio_final_to_reasoning": 1.6088906610342735,
"adv/ratio_step_to_reasoning": 1.8530965448653025,
"adv/std_final_conf": 0.8626038432121277,
"adv/std_reasoning": 0.7015207409858704,
"adv/std_step_conf": 0.9342188239097595,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7096652022420844,
"calib/avg_num_step_conf": 10.5859375,
"calib/ece": 0.27102880658436224,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7654320987654321,
"calib/gap": 0.09559839418269955,
"calib/mean_conf": 0.917119341563786,
"calib/mu_c": 0.9493788819875778,
"calib/mu_w": 0.8537804878048783,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2627983539094651,
"calib/std_conf": 0.16448927741073446,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9215673141326189,
"calib/step_q_c_n": 1493.0,
"calib/step_q_gap": 0.013884487509775778,
"calib/step_q_w": 0.9076828266228432,
"calib/step_q_w_n": 1217.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2838.0,
"completions/max_terminated_length": 2838.0,
"completions/mean_length": 806.23046875,
"completions/mean_terminated_length": 835.6072998046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 399.0,
"epoch": 0.1504,
"grad_norm": 0.04461470618844032,
"kl": 0.07849884033203125,
"learning_rate": 1.638888888888889e-06,
"loss": -0.1327,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01757936179637909,
"mask/share_reasoning": 0.8285161256790161,
"mask/share_step_conf": 0.11874829232692719,
"num_tokens": 42525153.0,
"reward": 0.8140060901641846,
"reward_std": 0.2348444014787674,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6903749704360962,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6220120787620544,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.667151689529419,
"adv/mean_abs_reasoning": 0.44450634717941284,
"adv/mean_abs_step_conf": 0.7593700885772705,
"adv/ratio_final_to_reasoning": 1.5008822568289253,
"adv/ratio_step_to_reasoning": 1.708344759069934,
"adv/std_final_conf": 0.8530212044715881,
"adv/std_reasoning": 0.7206939458847046,
"adv/std_step_conf": 0.9329362511634827,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7053686714237173,
"calib/avg_num_step_conf": 9.57421875,
"calib/ece": 0.3773770491803279,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.7172131147540983,
"calib/gap": 0.09912062521236831,
"calib/mean_conf": 0.8978688524590165,
"calib/mu_c": 0.9421481481481483,
"calib/mu_w": 0.84302752293578,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3609836065573771,
"calib/std_conf": 0.22754473351660362,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.9254538021259198,
"calib/step_q_c_n": 1223.0,
"calib/step_q_gap": 0.007803150660121716,
"calib/step_q_w": 0.917650651465798,
"calib/step_q_w_n": 1228.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 816.53125,
"completions/mean_terminated_length": 836.1280517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 422.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.041575007140636444,
"kl": 0.075927734375,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.092,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.017675727605819702,
"mask/share_reasoning": 0.8376240730285645,
"mask/share_step_conf": 0.12126270681619644,
"num_tokens": 42839345.0,
"reward": 0.7154664993286133,
"reward_std": 0.21693003177642822,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.5958648324012756,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.5405368804931641,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.7077853679656982,
"adv/mean_abs_reasoning": 0.5404249429702759,
"adv/mean_abs_step_conf": 0.7752654552459717,
"adv/ratio_final_to_reasoning": 1.309683013658804,
"adv/ratio_step_to_reasoning": 1.4345478781659646,
"adv/std_final_conf": 0.8969067335128784,
"adv/std_reasoning": 0.7929291129112244,
"adv/std_step_conf": 0.9345833659172058,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.6854413702239789,
"calib/avg_num_step_conf": 12.53515625,
"calib/ece": 0.3470886075949368,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.679324894514768,
"calib/gap": 0.0835924462011417,
"calib/mean_conf": 0.908270042194093,
"calib/mu_c": 0.9431884057971014,
"calib/mu_w": 0.8595959595959597,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3365400843881858,
"calib/std_conf": 0.18677069943066818,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9213190406976746,
"calib/step_q_c_n": 1376.0,
"calib/step_q_gap": 0.020118276922442613,
"calib/step_q_w": 0.901200763775232,
"calib/step_q_w_n": 1833.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2813.0,
"completions/max_terminated_length": 2813.0,
"completions/mean_length": 789.1328125,
"completions/mean_terminated_length": 841.74169921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 405.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.04036405682563782,
"kl": 0.0802764892578125,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.2751,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.017021197825670242,
"mask/share_reasoning": 0.8005629777908325,
"mask/share_step_conf": 0.11991582810878754,
"num_tokens": 43148699.0,
"reward": 0.7185168266296387,
"reward_std": 0.2503293752670288,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6061657667160034,
"rewards/format_reward_step": 0.921875,
"rewards/step_l1_reward": 0.5386803150177002,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.6762067675590515,
"adv/mean_abs_reasoning": 0.5450261831283569,
"adv/mean_abs_step_conf": 0.741301953792572,
"adv/ratio_final_to_reasoning": 1.2406867568778814,
"adv/ratio_step_to_reasoning": 1.3601217276161408,
"adv/std_final_conf": 0.8893658518791199,
"adv/std_reasoning": 0.7930487990379333,
"adv/std_step_conf": 0.9350625872612,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6217501585288522,
"calib/avg_num_step_conf": 10.33984375,
"calib/ece": 0.23797520661157034,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.7066115702479339,
"calib/gap": 0.0992549143944198,
"calib/mean_conf": 0.8933471074380165,
"calib/mu_c": 0.9245180722891566,
"calib/mu_w": 0.8252631578947368,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.22268595041322323,
"calib/std_conf": 0.21259308941070215,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9215514905149051,
"calib/step_q_c_n": 1476.0,
"calib/step_q_gap": 0.015168484537108506,
"calib/step_q_w": 0.9063830059777966,
"calib/step_q_w_n": 1171.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2976.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 789.56640625,
"completions/mean_terminated_length": 821.66259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.1536,
"grad_norm": 0.03410850092768669,
"kl": 0.07788848876953125,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.205,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.017329085618257523,
"mask/share_reasoning": 0.8310571908950806,
"mask/share_step_conf": 0.1125512421131134,
"num_tokens": 43454956.0,
"reward": 0.8180810809135437,
"reward_std": 0.2903987169265747,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.6948046684265137,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.6226074695587158,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.7054914236068726,
"adv/mean_abs_reasoning": 0.6051229238510132,
"adv/mean_abs_step_conf": 0.769420862197876,
"adv/ratio_final_to_reasoning": 1.1658646463384867,
"adv/ratio_step_to_reasoning": 1.271511674522703,
"adv/std_final_conf": 0.8777279853820801,
"adv/std_reasoning": 0.8268728852272034,
"adv/std_step_conf": 0.9352013468742371,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.6287099983419002,
"calib/avg_num_step_conf": 11.625,
"calib/ece": 0.26510548523206773,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.6835443037974683,
"calib/gap": 0.0986395290996519,
"calib/mean_conf": 0.8466244725738398,
"calib/mu_c": 0.8774233128834358,
"calib/mu_w": 0.7787837837837839,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2119831223628694,
"calib/std_conf": 0.28987629820641186,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.9135570804741111,
"calib/step_q_c_n": 1603.0,
"calib/step_q_gap": 0.12349153058336093,
"calib/step_q_w": 0.7900655498907502,
"calib/step_q_w_n": 1373.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3053.0,
"completions/max_terminated_length": 3053.0,
"completions/mean_length": 744.83984375,
"completions/mean_terminated_length": 791.19921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.06647800654172897,
"kl": 0.07993316650390625,
"learning_rate": 1.527777777777778e-06,
"loss": -0.2408,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.017830245196819305,
"mask/share_reasoning": 0.7956852912902832,
"mask/share_step_conf": 0.1278906762599945,
"num_tokens": 43748339.0,
"reward": 0.7790793180465698,
"reward_std": 0.3233683109283447,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6591414213180542,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": 0.5880796909332275,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.7313030958175659,
"adv/mean_abs_reasoning": 0.5380452871322632,
"adv/mean_abs_step_conf": 0.7613229751586914,
"adv/ratio_final_to_reasoning": 1.3591850227242968,
"adv/ratio_step_to_reasoning": 1.4149793583668018,
"adv/std_final_conf": 0.8902589082717896,
"adv/std_reasoning": 0.7931119799613953,
"adv/std_step_conf": 0.9345507621765137,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.7030024247610897,
"calib/avg_num_step_conf": 11.421875,
"calib/ece": 0.3994810126582279,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.6582278481012658,
"calib/gap": 0.13828947368421063,
"calib/mean_conf": 0.8491856540084388,
"calib/mu_c": 0.9209561403508774,
"calib/mu_w": 0.7826666666666667,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3838270042194093,
"calib/std_conf": 0.2700605326051657,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8985568181818182,
"calib/step_q_c_n": 1144.0,
"calib/step_q_gap": -0.0048785750766087155,
"calib/step_q_w": 0.9034353932584269,
"calib/step_q_w_n": 1780.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05859375,
"completions/max_length": 3003.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 812.39453125,
"completions/mean_terminated_length": 862.9585571289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 459.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.0607595257461071,
"kl": 0.07318878173828125,
"learning_rate": 1.5e-06,
"loss": -0.2455,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.01656745746731758,
"mask/share_reasoning": 0.8082844614982605,
"mask/share_step_conf": 0.1165543720126152,
"num_tokens": 44063528.0,
"reward": 0.6538082957267761,
"reward_std": 0.3023184537887573,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.5655796527862549,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": 0.46781808137893677,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.6403455138206482,
"adv/mean_abs_reasoning": 0.39228206872940063,
"adv/mean_abs_step_conf": 0.7484415769577026,
"adv/ratio_final_to_reasoning": 1.6323598881150077,
"adv/ratio_step_to_reasoning": 1.9079168705872807,
"adv/std_final_conf": 0.8574442863464355,
"adv/std_reasoning": 0.6816247701644897,
"adv/std_step_conf": 0.9328452944755554,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.6356864383180172,
"calib/avg_num_step_conf": 10.14453125,
"calib/ece": 0.3536250000000001,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6625,
"calib/gap": 0.1537301587301586,
"calib/mean_conf": 0.8040416666666667,
"calib/mu_c": 0.877063492063492,
"calib/mu_w": 0.7233333333333334,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31633333333333336,
"calib/std_conf": 0.33256064348810466,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9106250000000001,
"calib/step_q_c_n": 1168.0,
"calib/step_q_gap": 0.00865089223233051,
"calib/step_q_w": 0.9019741077676696,
"calib/step_q_w_n": 1429.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3039.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 802.4921875,
"completions/mean_terminated_length": 845.423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.1568,
"grad_norm": 0.04731550067663193,
"kl": 0.0695037841796875,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.1847,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.016832541674375534,
"mask/share_reasoning": 0.8159008026123047,
"mask/share_step_conf": 0.11648540198802948,
"num_tokens": 44372646.0,
"reward": 0.6841420531272888,
"reward_std": 0.18067112565040588,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.5989097356796265,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.48343682289123535,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.6124477982521057,
"adv/mean_abs_reasoning": 0.44582149386405945,
"adv/mean_abs_step_conf": 0.7373963594436646,
"adv/ratio_final_to_reasoning": 1.3737511687555697,
"adv/ratio_step_to_reasoning": 1.6540170664551057,
"adv/std_final_conf": 0.8261498212814331,
"adv/std_reasoning": 0.7206892967224121,
"adv/std_step_conf": 0.9336621165275574,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6929432957393484,
"calib/avg_num_step_conf": 10.41015625,
"calib/ece": 0.2697540983606558,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6557377049180327,
"calib/gap": 0.18275689223057645,
"calib/mean_conf": 0.7918852459016394,
"calib/mu_c": 0.8488095238095239,
"calib/mu_w": 0.6660526315789475,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.18655737704918038,
"calib/std_conf": 0.35254538591020995,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.894113427345187,
"calib/step_q_c_n": 1631.0,
"calib/step_q_gap": -0.015572259308584857,
"calib/step_q_w": 0.9096856866537718,
"calib/step_q_w_n": 1034.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2299.0,
"completions/max_terminated_length": 2299.0,
"completions/mean_length": 743.19921875,
"completions/mean_terminated_length": 776.5673217773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.042488742619752884,
"kl": 0.0799407958984375,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.1819,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.01842150092124939,
"mask/share_reasoning": 0.8100440502166748,
"mask/share_step_conf": 0.1285656988620758,
"num_tokens": 44668017.0,
"reward": 0.8046440482139587,
"reward_std": 0.24246975779533386,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.6908816695213318,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.5973125696182251,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.6814405918121338,
"adv/mean_abs_reasoning": 0.5871527194976807,
"adv/mean_abs_step_conf": 0.7749460339546204,
"adv/ratio_final_to_reasoning": 1.1605849197038856,
"adv/ratio_step_to_reasoning": 1.3198372556591411,
"adv/std_final_conf": 0.8630624413490295,
"adv/std_reasoning": 0.8266708850860596,
"adv/std_step_conf": 0.9342211484909058,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7871368673255466,
"calib/avg_num_step_conf": 10.1875,
"calib/ece": 0.2008641975308642,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5843621399176955,
"calib/gap": 0.35755390835579504,
"calib/mean_conf": 0.7440740740740741,
"calib/mu_c": 0.8676729559748426,
"calib/mu_w": 0.5101190476190476,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14530864197530863,
"calib/std_conf": 0.37642495613262844,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9176170510132775,
"calib/step_q_c_n": 1431.0,
"calib/step_q_gap": 0.04122792611947956,
"calib/step_q_w": 0.8763891248937979,
"calib/step_q_w_n": 1177.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2881.0,
"completions/max_terminated_length": 2881.0,
"completions/mean_length": 805.97265625,
"completions/mean_terminated_length": 838.7357177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.051406074315309525,
"kl": 0.06976318359375,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0968,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.017330875620245934,
"mask/share_reasoning": 0.826300323009491,
"mask/share_step_conf": 0.11730631440877914,
"num_tokens": 44978802.0,
"reward": 0.8375177383422852,
"reward_std": 0.26769959926605225,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.745905876159668,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6150670647621155,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.6216943860054016,
"adv/mean_abs_reasoning": 0.5660361647605896,
"adv/mean_abs_step_conf": 0.7330152988433838,
"adv/ratio_final_to_reasoning": 1.0983297971223327,
"adv/ratio_step_to_reasoning": 1.294997289004351,
"adv/std_final_conf": 0.8433254361152649,
"adv/std_reasoning": 0.8099560737609863,
"adv/std_step_conf": 0.9347100257873535,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6438846371882085,
"calib/avg_num_step_conf": 10.71484375,
"calib/ece": 0.28720164609053506,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6707818930041153,
"calib/gap": 0.20551020408163279,
"calib/mean_conf": 0.7843209876543209,
"calib/mu_c": 0.8655102040816328,
"calib/mu_w": 0.66,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23329218106995891,
"calib/std_conf": 0.35839010726216575,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9128195488721805,
"calib/step_q_c_n": 1463.0,
"calib/step_q_gap": 0.035010955122180665,
"calib/step_q_w": 0.8778085937499999,
"calib/step_q_w_n": 1280.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2877.0,
"completions/max_terminated_length": 2877.0,
"completions/mean_length": 746.6171875,
"completions/mean_terminated_length": 780.1387329101562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.16,
"grad_norm": 0.036864928901195526,
"kl": 0.07648468017578125,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.2174,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01857100799679756,
"mask/share_reasoning": 0.8006365299224854,
"mask/share_step_conf": 0.1378237009048462,
"num_tokens": 45274896.0,
"reward": 0.7855761051177979,
"reward_std": 0.25697511434555054,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.65923672914505,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.6080092787742615,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.6929680705070496,
"adv/mean_abs_reasoning": 0.5574659705162048,
"adv/mean_abs_step_conf": 0.7786697149276733,
"adv/ratio_final_to_reasoning": 1.2430679308826185,
"adv/ratio_step_to_reasoning": 1.396802237465073,
"adv/std_final_conf": 0.8634377717971802,
"adv/std_reasoning": 0.7931473255157471,
"adv/std_step_conf": 0.9336228370666504,
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.6892233370494241,
"calib/avg_num_step_conf": 11.02734375,
"calib/ece": 0.31870689655172413,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.5172413793103449,
"calib/gap": 0.2716283909327387,
"calib/mean_conf": 0.6432758620689656,
"calib/mu_c": 0.7802608695652175,
"calib/mu_w": 0.5086324786324787,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.23314655172413795,
"calib/std_conf": 0.4318300588914439,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8998490393412627,
"calib/step_q_c_n": 1093.0,
"calib/step_q_gap": 0.00872476188461535,
"calib/step_q_w": 0.8911242774566474,
"calib/step_q_w_n": 1730.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 3000.0,
"completions/max_terminated_length": 3000.0,
"completions/mean_length": 817.57421875,
"completions/mean_terminated_length": 886.8601684570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 440.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.0430113859474659,
"kl": 0.0694580078125,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.2838,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.015757262706756592,
"mask/share_reasoning": 0.7966047525405884,
"mask/share_step_conf": 0.10951297730207443,
"num_tokens": 45591219.0,
"reward": 0.6842317581176758,
"reward_std": 0.28172385692596436,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6139644384384155,
"rewards/format_reward_step": 0.90234375,
"rewards/step_l1_reward": 0.48418641090393066,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.7391480207443237,
"adv/mean_abs_reasoning": 0.607087254524231,
"adv/mean_abs_step_conf": 0.7786715626716614,
"adv/ratio_final_to_reasoning": 1.2175317719749983,
"adv/ratio_step_to_reasoning": 1.2826353326786601,
"adv/std_final_conf": 0.8781725168228149,
"adv/std_reasoning": 0.8266691565513611,
"adv/std_step_conf": 0.9354438781738281,
"calib/answer_extract_rate": 0.90234375,
"calib/auroc": 0.626782463592233,
"calib/avg_num_step_conf": 11.671875,
"calib/ece": 0.3554112554112554,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.5151515151515151,
"calib/gap": 0.18460254854368935,
"calib/mean_conf": 0.6011255411255412,
"calib/mu_c": 0.6834375,
"calib/mu_w": 0.4988349514563107,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2012121212121212,
"calib/std_conf": 0.45127768281844893,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8895158665581773,
"calib/step_q_c_n": 1229.0,
"calib/step_q_gap": 0.13263752659228756,
"calib/step_q_w": 0.7568783399658897,
"calib/step_q_w_n": 1759.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2844.0,
"completions/max_terminated_length": 2844.0,
"completions/mean_length": 779.5859375,
"completions/mean_terminated_length": 835.0376586914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 497.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.05237310379743576,
"kl": 0.0787506103515625,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.2079,
"mask/has_final_conf_rate": 0.90234375,
"mask/share_final_conf": 0.016670234501361847,
"mask/share_reasoning": 0.7984967231750488,
"mask/share_step_conf": 0.11842679232358932,
"num_tokens": 45896185.0,
"reward": 0.6975486278533936,
"reward_std": 0.2750810980796814,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5759539008140564,
"rewards/format_reward_step": 0.90234375,
"rewards/step_l1_reward": 0.5386744737625122,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.7310658097267151,
"adv/mean_abs_reasoning": 0.5798449516296387,
"adv/mean_abs_step_conf": 0.7634512186050415,
"adv/ratio_final_to_reasoning": 1.260795334463246,
"adv/ratio_step_to_reasoning": 1.3166471769037262,
"adv/std_final_conf": 0.8923981785774231,
"adv/std_reasoning": 0.8266776204109192,
"adv/std_step_conf": 0.935492753982544,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.5558449289381651,
"calib/avg_num_step_conf": 10.109375,
"calib/ece": 0.3710833333333334,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.48333333333333334,
"calib/gap": 0.09073145323312748,
"calib/mean_conf": 0.6119166666666668,
"calib/mu_c": 0.6455629139072848,
"calib/mu_w": 0.5548314606741573,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1769166666666667,
"calib/std_conf": 0.43642925320784304,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9105449775112445,
"calib/step_q_c_n": 1334.0,
"calib/step_q_gap": 0.029432537319856955,
"calib/step_q_w": 0.8811124401913876,
"calib/step_q_w_n": 1254.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3020.0,
"completions/max_terminated_length": 3020.0,
"completions/mean_length": 757.01953125,
"completions/mean_terminated_length": 797.5184936523438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.1632,
"grad_norm": 0.1365126371383667,
"kl": 0.0780487060546875,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.1451,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.017622144892811775,
"mask/share_reasoning": 0.8127738237380981,
"mask/share_step_conf": 0.11882279813289642,
"num_tokens": 46197302.0,
"reward": 0.7272552847862244,
"reward_std": 0.2770336866378784,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.5796132683753967,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.5694285035133362,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.6581071615219116,
"adv/mean_abs_reasoning": 0.43368637561798096,
"adv/mean_abs_step_conf": 0.7545804977416992,
"adv/ratio_final_to_reasoning": 1.5174725297379759,
"adv/ratio_step_to_reasoning": 1.739922073102851,
"adv/std_final_conf": 0.8604373335838318,
"adv/std_reasoning": 0.7394781112670898,
"adv/std_step_conf": 0.9343244433403015,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.6834290516879115,
"calib/avg_num_step_conf": 9.9453125,
"calib/ece": 0.33485355648535564,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5941422594142259,
"calib/gap": 0.2627153662978008,
"calib/mean_conf": 0.6868200836820084,
"calib/mu_c": 0.8165289256198348,
"calib/mu_w": 0.553813559322034,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2576987447698745,
"calib/std_conf": 0.42267915794250943,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.905844465648855,
"calib/step_q_c_n": 1048.0,
"calib/step_q_gap": 0.007313090481965823,
"calib/step_q_w": 0.8985313751668892,
"calib/step_q_w_n": 1498.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3067.0,
"completions/max_terminated_length": 3067.0,
"completions/mean_length": 738.48828125,
"completions/mean_terminated_length": 787.7208862304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.056357041001319885,
"kl": 0.07154083251953125,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.2656,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.01771831139922142,
"mask/share_reasoning": 0.8029538989067078,
"mask/share_step_conf": 0.11682778596878052,
"num_tokens": 46490795.0,
"reward": 0.7125537395477295,
"reward_std": 0.2651887536048889,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.6242159605026245,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.5204226970672607,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.6670347452163696,
"adv/mean_abs_reasoning": 0.48000073432922363,
"adv/mean_abs_step_conf": 0.7483382225036621,
"adv/ratio_final_to_reasoning": 1.389653593235678,
"adv/ratio_step_to_reasoning": 1.559035578454742,
"adv/std_final_conf": 0.8591130375862122,
"adv/std_reasoning": 0.7395284175872803,
"adv/std_step_conf": 0.9336428642272949,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.5808728215468818,
"calib/avg_num_step_conf": 10.51953125,
"calib/ece": 0.359746835443038,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.5611814345991561,
"calib/gap": 0.12453406308512183,
"calib/mean_conf": 0.6914767932489452,
"calib/mu_c": 0.7471755725190841,
"calib/mu_w": 0.6226415094339622,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2492405063291139,
"calib/std_conf": 0.4124844102532277,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9006412337662338,
"calib/step_q_c_n": 1232.0,
"calib/step_q_gap": 0.031168270042756796,
"calib/step_q_w": 0.869472963723477,
"calib/step_q_w_n": 1461.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2565.0,
"completions/max_terminated_length": 2565.0,
"completions/mean_length": 732.3046875,
"completions/mean_terminated_length": 781.1250610351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 421.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.06447619944810867,
"kl": 0.07836151123046875,
"learning_rate": 1.25e-06,
"loss": -0.248,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.017479702830314636,
"mask/share_reasoning": 0.7976410984992981,
"mask/share_step_conf": 0.12237919867038727,
"num_tokens": 46785481.0,
"reward": 0.689497709274292,
"reward_std": 0.24059349298477173,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5785812139511108,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": 0.5129141807556152,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.6695674657821655,
"adv/mean_abs_reasoning": 0.4885939955711365,
"adv/mean_abs_step_conf": 0.783113956451416,
"adv/ratio_final_to_reasoning": 1.370396426995551,
"adv/ratio_step_to_reasoning": 1.6027907906154346,
"adv/std_final_conf": 0.8554741740226746,
"adv/std_reasoning": 0.739392876625061,
"adv/std_step_conf": 0.9342993497848511,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.661429347088367,
"calib/avg_num_step_conf": 9.6796875,
"calib/ece": 0.31632653061224497,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5183673469387755,
"calib/gap": 0.21964775349531696,
"calib/mean_conf": 0.6479183673469389,
"calib/mu_c": 0.7429496402877699,
"calib/mu_w": 0.5233018867924529,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.19844897959183683,
"calib/std_conf": 0.4164830318093278,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9048530579825258,
"calib/step_q_c_n": 1259.0,
"calib/step_q_gap": 0.06672809216354847,
"calib/step_q_w": 0.8381249658189773,
"calib/step_q_w_n": 1219.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2647.0,
"completions/max_terminated_length": 2647.0,
"completions/mean_length": 799.234375,
"completions/mean_terminated_length": 821.7027587890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 398.0,
"epoch": 0.1664,
"grad_norm": 0.061981022357940674,
"kl": 0.06970977783203125,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.1421,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01749337837100029,
"mask/share_reasoning": 0.8347588181495667,
"mask/share_step_conf": 0.12040403485298157,
"num_tokens": 47094845.0,
"reward": 0.7639741897583008,
"reward_std": 0.22314155101776123,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6530945301055908,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.5748538374900818,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.6397287845611572,
"adv/mean_abs_reasoning": 0.3884810209274292,
"adv/mean_abs_step_conf": 0.7641727924346924,
"adv/ratio_final_to_reasoning": 1.64674398516025,
"adv/ratio_step_to_reasoning": 1.967078830801994,
"adv/std_final_conf": 0.8433766961097717,
"adv/std_reasoning": 0.6818484663963318,
"adv/std_step_conf": 0.9321699142456055,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.6683219178082193,
"calib/avg_num_step_conf": 10.78125,
"calib/ece": 0.22656652360515023,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.6094420600858369,
"calib/gap": 0.24674058219178074,
"calib/mean_conf": 0.7502575107296138,
"calib/mu_c": 0.8275625,
"calib/mu_w": 0.5808219178082192,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.14506437768240343,
"calib/std_conf": 0.3626203192320913,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8907831715210355,
"calib/step_q_c_n": 1545.0,
"calib/step_q_gap": 0.07269263654161162,
"calib/step_q_w": 0.8180905349794239,
"calib/step_q_w_n": 1215.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06640625,
"completions/max_length": 2633.0,
"completions/max_terminated_length": 2633.0,
"completions/mean_length": 786.59765625,
"completions/mean_terminated_length": 842.548095703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 321.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.07800068706274033,
"kl": 0.07373809814453125,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.1742,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.016521422192454338,
"mask/share_reasoning": 0.799405038356781,
"mask/share_step_conf": 0.11766725033521652,
"num_tokens": 47399942.0,
"reward": 0.8016955852508545,
"reward_std": 0.24124254286289215,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6876152157783508,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": 0.608744740486145,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.5802692174911499,
"adv/mean_abs_reasoning": 0.46292001008987427,
"adv/mean_abs_step_conf": 0.7462335824966431,
"adv/ratio_final_to_reasoning": 1.2534978070584866,
"adv/ratio_step_to_reasoning": 1.6120140979685982,
"adv/std_final_conf": 0.8109095096588135,
"adv/std_reasoning": 0.7393543720245361,
"adv/std_step_conf": 0.9349581599235535,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6089843749999999,
"calib/avg_num_step_conf": 9.59765625,
"calib/ece": 0.28076612903225817,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7580645161290323,
"calib/gap": 0.11602272727272733,
"calib/mean_conf": 0.8763306451612904,
"calib/mu_c": 0.9175000000000001,
"calib/mu_w": 0.8014772727272728,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25596774193548394,
"calib/std_conf": 0.259120417323254,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9124654377880184,
"calib/step_q_c_n": 1519.0,
"calib/step_q_gap": 0.022699979365843448,
"calib/step_q_w": 0.8897654584221749,
"calib/step_q_w_n": 938.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2413.0,
"completions/max_terminated_length": 2413.0,
"completions/mean_length": 755.12890625,
"completions/mean_terminated_length": 776.357421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 428.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.044181808829307556,
"kl": 0.0697174072265625,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.116,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018855862319469452,
"mask/share_reasoning": 0.8255026340484619,
"mask/share_step_conf": 0.12829776108264923,
"num_tokens": 47698495.0,
"reward": 0.800797700881958,
"reward_std": 0.24212971329689026,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6816230416297913,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.6012223958969116,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.6155053377151489,
"adv/mean_abs_reasoning": 0.5257014036178589,
"adv/mean_abs_step_conf": 0.7635072469711304,
"adv/ratio_final_to_reasoning": 1.1708268866684823,
"adv/ratio_step_to_reasoning": 1.4523591561991274,
"adv/std_final_conf": 0.8442915081977844,
"adv/std_reasoning": 0.8098849058151245,
"adv/std_step_conf": 0.9338783621788025,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.6846465390279823,
"calib/avg_num_step_conf": 11.21875,
"calib/ece": 0.30071729957805904,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.6877637130801688,
"calib/gap": 0.1699845360824742,
"calib/mean_conf": 0.8309282700421942,
"calib/mu_c": 0.9005,
"calib/mu_w": 0.7305154639175258,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.270464135021097,
"calib/std_conf": 0.30175433451587225,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9182377049180328,
"calib/step_q_c_n": 1342.0,
"calib/step_q_gap": 0.07119522125790201,
"calib/step_q_w": 0.8470424836601308,
"calib/step_q_w_n": 1530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 739.734375,
"completions/mean_terminated_length": 789.050048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 403.0,
"epoch": 0.1696,
"grad_norm": 0.04411659762263298,
"kl": 0.07027435302734375,
"learning_rate": 1.138888888888889e-06,
"loss": -0.2479,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.017607979476451874,
"mask/share_reasoning": 0.7957726716995239,
"mask/share_step_conf": 0.12411928176879883,
"num_tokens": 47992651.0,
"reward": 0.7762739658355713,
"reward_std": 0.26523715257644653,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6403324007987976,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": 0.6169029474258423,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.6596049070358276,
"adv/mean_abs_reasoning": 0.5177563428878784,
"adv/mean_abs_step_conf": 0.7162469625473022,
"adv/ratio_final_to_reasoning": 1.2739677960423692,
"adv/ratio_step_to_reasoning": 1.3833668527406289,
"adv/std_final_conf": 0.8876446485519409,
"adv/std_reasoning": 0.8100550770759583,
"adv/std_step_conf": 0.934569239616394,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.6421108823305117,
"calib/avg_num_step_conf": 11.1015625,
"calib/ece": 0.3119230769230767,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.6324786324786325,
"calib/gap": 0.11231525966598055,
"calib/mean_conf": 0.810042735042735,
"calib/mu_c": 0.8546808510638297,
"calib/mu_w": 0.7423655913978492,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.25970085470085447,
"calib/std_conf": 0.2998054894544952,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9068384615384616,
"calib/step_q_c_n": 1300.0,
"calib/step_q_gap": 0.025583597725232,
"calib/step_q_w": 0.8812548638132296,
"calib/step_q_w_n": 1542.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 2666.0,
"completions/max_terminated_length": 2666.0,
"completions/mean_length": 776.13671875,
"completions/mean_terminated_length": 838.3585815429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 462.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.058579009026288986,
"kl": 0.067352294921875,
"learning_rate": 1.111111111111111e-06,
"loss": -0.3345,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.016484688967466354,
"mask/share_reasoning": 0.7973984479904175,
"mask/share_step_conf": 0.11189806461334229,
"num_tokens": 48296182.0,
"reward": 0.7409486770629883,
"reward_std": 0.27730506658554077,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6198972463607788,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": 0.5698126554489136,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.5626407265663147,
"adv/mean_abs_reasoning": 0.3099568486213684,
"adv/mean_abs_step_conf": 0.744856595993042,
"adv/ratio_final_to_reasoning": 1.8152227610676717,
"adv/ratio_step_to_reasoning": 2.403097719266499,
"adv/std_final_conf": 0.8191770315170288,
"adv/std_reasoning": 0.5961928367614746,
"adv/std_step_conf": 0.9331373572349548,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6836936936936937,
"calib/avg_num_step_conf": 10.14453125,
"calib/ece": 0.17024489795918363,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.6816326530612244,
"calib/gap": 0.23347747747747738,
"calib/mean_conf": 0.8276326530612245,
"calib/mu_c": 0.8848108108108108,
"calib/mu_w": 0.6513333333333334,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.12138775510204079,
"calib/std_conf": 0.30573048647844897,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8879506172839505,
"calib/step_q_c_n": 1755.0,
"calib/step_q_gap": 0.03697558755743435,
"calib/step_q_w": 0.8509750297265162,
"calib/step_q_w_n": 841.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1530.0,
"completions/max_terminated_length": 1530.0,
"completions/mean_length": 735.34375,
"completions/mean_terminated_length": 765.2357177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 337.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.09371700137853622,
"kl": 0.068603515625,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.1205,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.0185359176248312,
"mask/share_reasoning": 0.8156201243400574,
"mask/share_step_conf": 0.12678146362304688,
"num_tokens": 48588350.0,
"reward": 0.8863547444343567,
"reward_std": 0.1917186677455902,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.764299213886261,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.6732540130615234,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.5152875781059265,
"adv/mean_abs_reasoning": 0.4537314474582672,
"adv/mean_abs_step_conf": 0.7489913105964661,
"adv/ratio_final_to_reasoning": 1.1356664410026838,
"adv/ratio_step_to_reasoning": 1.6507370489574804,
"adv/std_final_conf": 0.7631077766418457,
"adv/std_reasoning": 0.7206796407699585,
"adv/std_step_conf": 0.93306964635849,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7224846480869155,
"calib/avg_num_step_conf": 9.3828125,
"calib/ece": 0.1806072874493927,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6923076923076923,
"calib/gap": 0.2122295701464335,
"calib/mean_conf": 0.8627935222672064,
"calib/mu_c": 0.9255172413793101,
"calib/mu_w": 0.7132876712328766,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16947368421052628,
"calib/std_conf": 0.24479002379082498,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9073752869964518,
"calib/step_q_c_n": 1597.0,
"calib/step_q_gap": 0.030490814946762423,
"calib/step_q_w": 0.8768844720496893,
"calib/step_q_w_n": 805.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 753.15234375,
"completions/mean_terminated_length": 777.4475708007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 405.0,
"epoch": 0.1728,
"grad_norm": 0.044426869601011276,
"kl": 0.06839752197265625,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.1427,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.01875723898410797,
"mask/share_reasoning": 0.8271980881690979,
"mask/share_step_conf": 0.12279466539621353,
"num_tokens": 48885301.0,
"reward": 0.8925344944000244,
"reward_std": 0.21106109023094177,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7672237753868103,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.6889388561248779,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.5857610702514648,
"adv/mean_abs_reasoning": 0.44100522994995117,
"adv/mean_abs_step_conf": 0.7768477201461792,
"adv/ratio_final_to_reasoning": 1.3282406431278417,
"adv/ratio_step_to_reasoning": 1.761538565504863,
"adv/std_final_conf": 0.7958298325538635,
"adv/std_reasoning": 0.7014980316162109,
"adv/std_step_conf": 0.9332451224327087,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7322478991596638,
"calib/avg_num_step_conf": 9.52734375,
"calib/ece": 0.283734439834025,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6556016597510373,
"calib/gap": 0.22323109243697492,
"calib/mean_conf": 0.8408298755186724,
"calib/mu_c": 0.9380882352941178,
"calib/mu_w": 0.7148571428571429,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.280124481327801,
"calib/std_conf": 0.260970372596227,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9138898233809927,
"calib/step_q_c_n": 1189.0,
"calib/step_q_gap": 0.05811382338099247,
"calib/step_q_w": 0.8557760000000002,
"calib/step_q_w_n": 1250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2539.0,
"completions/max_terminated_length": 2539.0,
"completions/mean_length": 778.171875,
"completions/mean_terminated_length": 819.8024291992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.046456970274448395,
"kl": 0.062042236328125,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.1249,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.018092192709445953,
"mask/share_reasoning": 0.8152344822883606,
"mask/share_step_conf": 0.11589207500219345,
"num_tokens": 49189345.0,
"reward": 0.7753407955169678,
"reward_std": 0.22338135540485382,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6771906018257141,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.5789597034454346,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.6575444936752319,
"adv/mean_abs_reasoning": 0.5587068200111389,
"adv/mean_abs_step_conf": 0.7896183729171753,
"adv/ratio_final_to_reasoning": 1.1769043622236839,
"adv/ratio_step_to_reasoning": 1.4132964636111525,
"adv/std_final_conf": 0.8603196740150452,
"adv/std_reasoning": 0.826644241809845,
"adv/std_step_conf": 0.9346628189086914,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.6551515151515153,
"calib/avg_num_step_conf": 10.12109375,
"calib/ece": 0.2635294117647058,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.7142857142857143,
"calib/gap": 0.12994545454545448,
"calib/mean_conf": 0.8623529411764707,
"calib/mu_c": 0.9103999999999999,
"calib/mu_w": 0.7804545454545454,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2478151260504201,
"calib/std_conf": 0.24918929351790883,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.887434456928839,
"calib/step_q_c_n": 1335.0,
"calib/step_q_gap": 0.04784050788425287,
"calib/step_q_w": 0.8395939490445862,
"calib/step_q_w_n": 1256.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2283.0,
"completions/max_terminated_length": 2283.0,
"completions/mean_length": 788.15625,
"completions/mean_terminated_length": 840.7000732421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.049657903611660004,
"kl": 0.06340789794921875,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.2315,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.01661180891096592,
"mask/share_reasoning": 0.8164951801300049,
"mask/share_step_conf": 0.10439302027225494,
"num_tokens": 49497249.0,
"reward": 0.7687405347824097,
"reward_std": 0.2964133322238922,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6576253771781921,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": 0.5775119662284851,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.5841305255889893,
"adv/mean_abs_reasoning": 0.4875459671020508,
"adv/mean_abs_step_conf": 0.7689133882522583,
"adv/ratio_final_to_reasoning": 1.1981034917815696,
"adv/ratio_step_to_reasoning": 1.5771095243031987,
"adv/std_final_conf": 0.7935722470283508,
"adv/std_reasoning": 0.7394719123840332,
"adv/std_step_conf": 0.9343661069869995,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6993981737686773,
"calib/avg_num_step_conf": 9.42578125,
"calib/ece": 0.32152263374485596,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.6790123456790124,
"calib/gap": 0.15990453790813497,
"calib/mean_conf": 0.841275720164609,
"calib/mu_c": 0.9097122302158274,
"calib/mu_w": 0.7498076923076924,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2953909465020576,
"calib/std_conf": 0.27270859443063616,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9023426517571884,
"calib/step_q_c_n": 1252.0,
"calib/step_q_gap": 0.02143825899233054,
"calib/step_q_w": 0.8809043927648579,
"calib/step_q_w_n": 1161.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 1751.0,
"completions/max_terminated_length": 1751.0,
"completions/mean_length": 796.234375,
"completions/mean_terminated_length": 828.6016235351562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.176,
"grad_norm": 0.03774955868721008,
"kl": 0.06166839599609375,
"learning_rate": 9.722222222222224e-07,
"loss": -0.1753,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.017093051224946976,
"mask/share_reasoning": 0.8328508734703064,
"mask/share_step_conf": 0.11099356412887573,
"num_tokens": 49806661.0,
"reward": 0.7528131604194641,
"reward_std": 0.2334972769021988,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6517425775527954,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.555446207523346,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.5557419657707214,
"adv/mean_abs_reasoning": 0.39997929334640503,
"adv/mean_abs_step_conf": 0.7824414968490601,
"adv/ratio_final_to_reasoning": 1.3894268403775016,
"adv/ratio_step_to_reasoning": 1.9562050082713176,
"adv/std_final_conf": 0.791394829750061,
"adv/std_reasoning": 0.6817243695259094,
"adv/std_step_conf": 0.9336012005805969,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.8174369747899161,
"calib/avg_num_step_conf": 9.38671875,
"calib/ece": 0.21129166666666663,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.7291666666666666,
"calib/gap": 0.2625630252100841,
"calib/mean_conf": 0.8571250000000001,
"calib/mu_c": 0.9337058823529413,
"calib/mu_w": 0.6711428571428572,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1800416666666666,
"calib/std_conf": 0.26829023657536755,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8895049309664695,
"calib/step_q_c_n": 1521.0,
"calib/step_q_gap": 0.019992459311140798,
"calib/step_q_w": 0.8695124716553287,
"calib/step_q_w_n": 882.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2600.0,
"completions/max_terminated_length": 2600.0,
"completions/mean_length": 753.671875,
"completions/mean_terminated_length": 797.272705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 414.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.052816737443208694,
"kl": 0.0668182373046875,
"learning_rate": 9.444444444444445e-07,
"loss": -0.168,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.017924867570400238,
"mask/share_reasoning": 0.81169593334198,
"mask/share_step_conf": 0.11569161713123322,
"num_tokens": 50105785.0,
"reward": 0.8533405065536499,
"reward_std": 0.2136755734682083,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.757287859916687,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.6290807127952576,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.5000762343406677,
"adv/mean_abs_reasoning": 0.4703137278556824,
"adv/mean_abs_step_conf": 0.762180745601654,
"adv/ratio_final_to_reasoning": 1.0632822406028473,
"adv/ratio_step_to_reasoning": 1.620579414249061,
"adv/std_final_conf": 0.741891086101532,
"adv/std_reasoning": 0.7393703460693359,
"adv/std_step_conf": 0.933522641658783,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5653275296132438,
"calib/avg_num_step_conf": 9.90625,
"calib/ece": 0.3084081632653061,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8122448979591836,
"calib/gap": 0.062127872127872186,
"calib/mean_conf": 0.921469387755102,
"calib/mu_c": 0.9445454545454546,
"calib/mu_w": 0.8824175824175824,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30065306122448976,
"calib/std_conf": 0.17472627012288028,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.898509129213483,
"calib/step_q_c_n": 1424.0,
"calib/step_q_gap": 0.03991110763074923,
"calib/step_q_w": 0.8585980215827338,
"calib/step_q_w_n": 1112.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2898.0,
"completions/max_terminated_length": 2898.0,
"completions/mean_length": 779.24609375,
"completions/mean_terminated_length": 807.6397094726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.04024127870798111,
"kl": 0.06053924560546875,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0904,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.017891276627779007,
"mask/share_reasoning": 0.8272270560264587,
"mask/share_step_conf": 0.11972543597221375,
"num_tokens": 50410880.0,
"reward": 0.7745072841644287,
"reward_std": 0.21950817108154297,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6461308598518372,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.5919462442398071,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.6446714997291565,
"adv/mean_abs_reasoning": 0.5393311977386475,
"adv/mean_abs_step_conf": 0.7846331000328064,
"adv/ratio_final_to_reasoning": 1.1953165372820793,
"adv/ratio_step_to_reasoning": 1.4548260944715994,
"adv/std_final_conf": 0.8436398506164551,
"adv/std_reasoning": 0.7755438685417175,
"adv/std_step_conf": 0.9343622326850891,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7345307154544102,
"calib/avg_num_step_conf": 9.32421875,
"calib/ece": 0.2300404858299595,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.7206477732793523,
"calib/gap": 0.21213520749665327,
"calib/mean_conf": 0.8814574898785424,
"calib/mu_c": 0.9510240963855422,
"calib/mu_w": 0.7388888888888889,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2197165991902834,
"calib/std_conf": 0.23510765303744627,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9018106796116505,
"calib/step_q_c_n": 1442.0,
"calib/step_q_gap": 0.10523925104022192,
"calib/step_q_w": 0.7965714285714286,
"calib/step_q_w_n": 945.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2843.0,
"completions/max_terminated_length": 2843.0,
"completions/mean_length": 818.5703125,
"completions/mean_terminated_length": 838.216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 395.0,
"epoch": 0.1792,
"grad_norm": 0.04501558840274811,
"kl": 0.0583648681640625,
"learning_rate": 8.88888888888889e-07,
"loss": -0.1051,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.017265118658542633,
"mask/share_reasoning": 0.8477543592453003,
"mask/share_step_conf": 0.11154300719499588,
"num_tokens": 50725106.0,
"reward": 0.8554896712303162,
"reward_std": 0.2765381336212158,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.7467812299728394,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.6415417790412903,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.5372599959373474,
"adv/mean_abs_reasoning": 0.444337397813797,
"adv/mean_abs_step_conf": 0.762183427810669,
"adv/ratio_final_to_reasoning": 1.2091262148555193,
"adv/ratio_step_to_reasoning": 1.7153258572443362,
"adv/std_final_conf": 0.7773349285125732,
"adv/std_reasoning": 0.7015244364738464,
"adv/std_step_conf": 0.9335224628448486,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6206280491994778,
"calib/avg_num_step_conf": 9.359375,
"calib/ece": 0.3318292682926829,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8089430894308943,
"calib/gap": 0.10794681508967241,
"calib/mean_conf": 0.9066260162601626,
"calib/mu_c": 0.9500680272108846,
"calib/mu_w": 0.8421212121212122,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.32044715447154465,
"calib/std_conf": 0.20670173706004993,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9029939668174963,
"calib/step_q_c_n": 1326.0,
"calib/step_q_gap": 0.01505938737824386,
"calib/step_q_w": 0.8879345794392525,
"calib/step_q_w_n": 1070.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2868.0,
"completions/max_terminated_length": 2868.0,
"completions/mean_length": 797.59765625,
"completions/mean_terminated_length": 820.02001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 426.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.0497799851000309,
"kl": 0.06368255615234375,
"learning_rate": 8.611111111111112e-07,
"loss": -0.117,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.017907004803419113,
"mask/share_reasoning": 0.840368390083313,
"mask/share_step_conf": 0.11438088119029999,
"num_tokens": 51033475.0,
"reward": 0.7868397831916809,
"reward_std": 0.2144777625799179,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6468933820724487,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.6197549104690552,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.5735569000244141,
"adv/mean_abs_reasoning": 0.4856990575790405,
"adv/mean_abs_step_conf": 0.7721710205078125,
"adv/ratio_final_to_reasoning": 1.180889464524184,
"adv/ratio_step_to_reasoning": 1.5898137096594074,
"adv/std_final_conf": 0.7941933274269104,
"adv/std_reasoning": 0.7576899528503418,
"adv/std_step_conf": 0.9353705644607544,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.619972115719763,
"calib/avg_num_step_conf": 9.48828125,
"calib/ece": 0.3219918699186991,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8577235772357723,
"calib/gap": 0.09807110491460413,
"calib/mean_conf": 0.9309349593495936,
"calib/mu_c": 0.9688079470198673,
"calib/mu_w": 0.8707368421052631,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3195528455284552,
"calib/std_conf": 0.17873181253029413,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9172869101978691,
"calib/step_q_c_n": 1314.0,
"calib/step_q_gap": 0.03548421961490944,
"calib/step_q_w": 0.8818026905829597,
"calib/step_q_w_n": 1115.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2862.0,
"completions/max_terminated_length": 2862.0,
"completions/mean_length": 781.828125,
"completions/mean_terminated_length": 807.04833984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 430.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.03491868078708649,
"kl": 0.06319427490234375,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0732,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.017963500693440437,
"mask/share_reasoning": 0.8349390029907227,
"mask/share_step_conf": 0.11584748327732086,
"num_tokens": 51337775.0,
"reward": 0.7697086930274963,
"reward_std": 0.28596359491348267,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6465945243835449,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.5834478139877319,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.5924351811408997,
"adv/mean_abs_reasoning": 0.49389344453811646,
"adv/mean_abs_step_conf": 0.75652015209198,
"adv/ratio_final_to_reasoning": 1.1995202359791155,
"adv/ratio_step_to_reasoning": 1.531747708859487,
"adv/std_final_conf": 0.812272846698761,
"adv/std_reasoning": 0.7578060626983643,
"adv/std_step_conf": 0.9342533349990845,
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.6203609000584454,
"calib/avg_num_step_conf": 10.51953125,
"calib/ece": 0.42354700854700866,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.7735042735042735,
"calib/gap": 0.07948421975452946,
"calib/mean_conf": 0.8855128205128205,
"calib/mu_c": 0.9249152542372879,
"calib/mu_w": 0.8454310344827585,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4023931623931625,
"calib/std_conf": 0.2280443469911254,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9110245901639344,
"calib/step_q_c_n": 1098.0,
"calib/step_q_gap": 0.10283650238963982,
"calib/step_q_w": 0.8081880877742946,
"calib/step_q_w_n": 1595.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2781.0,
"completions/max_terminated_length": 2781.0,
"completions/mean_length": 750.37890625,
"completions/mean_terminated_length": 800.4042358398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 488.0,
"epoch": 0.1824,
"grad_norm": 0.05241640657186508,
"kl": 0.05883026123046875,
"learning_rate": 8.055555555555557e-07,
"loss": -0.2661,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.017459597438573837,
"mask/share_reasoning": 0.8048005104064941,
"mask/share_step_conf": 0.11523989588022232,
"num_tokens": 51636768.0,
"reward": 0.6549869775772095,
"reward_std": 0.2620229721069336,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.5414218902587891,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l1_reward": 0.49433326721191406,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.5368980169296265,
"adv/mean_abs_reasoning": 0.40605199337005615,
"adv/mean_abs_step_conf": 0.7671827077865601,
"adv/ratio_final_to_reasoning": 1.3222395794036248,
"adv/ratio_step_to_reasoning": 1.8893706232526406,
"adv/std_final_conf": 0.7779921293258667,
"adv/std_reasoning": 0.6816897392272949,
"adv/std_step_conf": 0.9344073534011841,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6197683397683398,
"calib/avg_num_step_conf": 8.84375,
"calib/ece": 0.2395943775100402,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8192771084337349,
"calib/gap": 0.07640432432432442,
"calib/mean_conf": 0.9343734939759037,
"calib/mu_c": 0.9570800000000002,
"calib/mu_w": 0.8806756756756757,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2355783132530121,
"calib/std_conf": 0.14996382783483647,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8967925973197193,
"calib/step_q_c_n": 1567.0,
"calib/step_q_gap": 0.0013837020542958323,
"calib/step_q_w": 0.8954088952654234,
"calib/step_q_w_n": 697.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2237.0,
"completions/max_terminated_length": 2237.0,
"completions/mean_length": 747.42578125,
"completions/mean_terminated_length": 756.28857421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 438.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.027499012649059296,
"kl": 0.06461334228515625,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0052,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019513994455337524,
"mask/share_reasoning": 0.8444157242774963,
"mask/share_step_conf": 0.12435153126716614,
"num_tokens": 51931461.0,
"reward": 0.8563653826713562,
"reward_std": 0.21522122621536255,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7265148162841797,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.6549659371376038,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.5885812044143677,
"adv/mean_abs_reasoning": 0.4769488275051117,
"adv/mean_abs_step_conf": 0.7240892648696899,
"adv/ratio_final_to_reasoning": 1.234055249686214,
"adv/ratio_step_to_reasoning": 1.5181697136301893,
"adv/std_final_conf": 0.8601162433624268,
"adv/std_reasoning": 0.775355339050293,
"adv/std_step_conf": 0.9352255463600159,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5762152777777778,
"calib/avg_num_step_conf": 9.84375,
"calib/ece": 0.32524390243902435,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.8211382113821138,
"calib/gap": 0.06849999999999978,
"calib/mean_conf": 0.9309349593495935,
"calib/mu_c": 0.9576666666666666,
"calib/mu_w": 0.8891666666666668,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3232113821138211,
"calib/std_conf": 0.1544474146750119,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8994468704512372,
"calib/step_q_c_n": 1374.0,
"calib/step_q_gap": 0.0021795638776478388,
"calib/step_q_w": 0.8972673065735893,
"calib/step_q_w_n": 1146.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2902.0,
"completions/max_terminated_length": 2902.0,
"completions/mean_length": 806.3203125,
"completions/mean_terminated_length": 825.6720581054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 360.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.07306650280952454,
"kl": 0.057220458984375,
"learning_rate": 7.5e-07,
"loss": -0.0718,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.01831592619419098,
"mask/share_reasoning": 0.8370583057403564,
"mask/share_step_conf": 0.12118831276893616,
"num_tokens": 52241039.0,
"reward": 0.7683874368667603,
"reward_std": 0.25729119777679443,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6415566205978394,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.5858431458473206,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.6525813937187195,
"adv/mean_abs_reasoning": 0.581325888633728,
"adv/mean_abs_step_conf": 0.7921609282493591,
"adv/ratio_final_to_reasoning": 1.1225741128654376,
"adv/ratio_step_to_reasoning": 1.3626795980326114,
"adv/std_final_conf": 0.843597412109375,
"adv/std_reasoning": 0.792957603931427,
"adv/std_step_conf": 0.9348820447921753,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.622702205882353,
"calib/avg_num_step_conf": 9.7421875,
"calib/ece": 0.32099999999999973,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.7291666666666666,
"calib/gap": 0.13005656108597285,
"calib/mean_conf": 0.8765833333333334,
"calib/mu_c": 0.9329411764705883,
"calib/mu_w": 0.8028846153846154,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31545833333333306,
"calib/std_conf": 0.23909898868227966,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8955775316455697,
"calib/step_q_c_n": 1264.0,
"calib/step_q_gap": 0.04651493001955331,
"calib/step_q_w": 0.8490626016260164,
"calib/step_q_w_n": 1230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2587.0,
"completions/max_terminated_length": 2587.0,
"completions/mean_length": 850.3828125,
"completions/mean_terminated_length": 884.951171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 476.0,
"epoch": 0.1856,
"grad_norm": 0.03435070812702179,
"kl": 0.05667877197265625,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0622,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.016576357185840607,
"mask/share_reasoning": 0.8326107263565063,
"mask/share_step_conf": 0.11175040900707245,
"num_tokens": 52562969.0,
"reward": 0.7409429550170898,
"reward_std": 0.2846534252166748,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6235312223434448,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.5638233423233032,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.5845508575439453,
"adv/mean_abs_reasoning": 0.4365823566913605,
"adv/mean_abs_step_conf": 0.7559595704078674,
"adv/ratio_final_to_reasoning": 1.3389246005586761,
"adv/ratio_step_to_reasoning": 1.7315394422644728,
"adv/std_final_conf": 0.8250952959060669,
"adv/std_reasoning": 0.7392560243606567,
"adv/std_step_conf": 0.9347420930862427,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6401459854014598,
"calib/avg_num_step_conf": 9.359375,
"calib/ece": 0.4161983471074381,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6776859504132231,
"calib/gap": 0.12556969064998258,
"calib/mean_conf": 0.82900826446281,
"calib/mu_c": 0.9000952380952382,
"calib/mu_w": 0.7745255474452556,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40566115702479344,
"calib/std_conf": 0.2803315774580673,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.908609062170706,
"calib/step_q_c_n": 949.0,
"calib/step_q_gap": 0.06408936624810757,
"calib/step_q_w": 0.8445196959225985,
"calib/step_q_w_n": 1447.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 3011.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 823.83203125,
"completions/mean_terminated_length": 857.3211059570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.05315734073519707,
"kl": 0.0612945556640625,
"learning_rate": 6.944444444444446e-07,
"loss": -0.1763,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.017256654798984528,
"mask/share_reasoning": 0.8306593894958496,
"mask/share_step_conf": 0.11302149295806885,
"num_tokens": 52879694.0,
"reward": 0.6704280972480774,
"reward_std": 0.21455532312393188,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.5495570302009583,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.5202054381370544,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.6492704153060913,
"adv/mean_abs_reasoning": 0.479910671710968,
"adv/mean_abs_step_conf": 0.7665454745292664,
"adv/ratio_final_to_reasoning": 1.352898473774974,
"adv/ratio_step_to_reasoning": 1.5972669909514485,
"adv/std_final_conf": 0.8582326173782349,
"adv/std_reasoning": 0.7394721508026123,
"adv/std_step_conf": 0.9343740940093994,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6369176598049837,
"calib/avg_num_step_conf": 9.859375,
"calib/ece": 0.29304878048780497,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6463414634146342,
"calib/gap": 0.1483613217768146,
"calib/mean_conf": 0.8304471544715448,
"calib/mu_c": 0.893169014084507,
"calib/mu_w": 0.7448076923076924,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27313008130081307,
"calib/std_conf": 0.2756105625093126,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8961136536994663,
"calib/step_q_c_n": 1311.0,
"calib/step_q_gap": 0.04517883521761801,
"calib/step_q_w": 0.8509348184818483,
"calib/step_q_w_n": 1212.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2466.0,
"completions/max_terminated_length": 2466.0,
"completions/mean_length": 775.703125,
"completions/mean_terminated_length": 800.7257690429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.041484564542770386,
"kl": 0.0646820068359375,
"learning_rate": 6.666666666666667e-07,
"loss": -0.1311,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.018372666090726852,
"mask/share_reasoning": 0.828335702419281,
"mask/share_step_conf": 0.12204164266586304,
"num_tokens": 53182338.0,
"reward": 0.7937588691711426,
"reward_std": 0.254478394985199,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6575058698654175,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6276681423187256,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.5691676735877991,
"adv/mean_abs_reasoning": 0.37612125277519226,
"adv/mean_abs_step_conf": 0.7652287483215332,
"adv/ratio_final_to_reasoning": 1.5132558168096677,
"adv/ratio_step_to_reasoning": 2.034526745498507,
"adv/std_final_conf": 0.8127015829086304,
"adv/std_reasoning": 0.6817793846130371,
"adv/std_step_conf": 0.9344162344932556,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.7425963197239793,
"calib/avg_num_step_conf": 10.34765625,
"calib/ece": 0.2562396694214875,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6859504132231405,
"calib/gap": 0.20173519263944817,
"calib/mean_conf": 0.8595454545454545,
"calib/mu_c": 0.9379054054054055,
"calib/mu_w": 0.7361702127659573,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2521074380165288,
"calib/std_conf": 0.23704316655393667,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.905525998492841,
"calib/step_q_c_n": 1327.0,
"calib/step_q_gap": 0.08445262481659288,
"calib/step_q_w": 0.8210733736762481,
"calib/step_q_w_n": 1322.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2601.0,
"completions/max_terminated_length": 2601.0,
"completions/mean_length": 766.640625,
"completions/mean_terminated_length": 797.8048706054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.1888,
"grad_norm": 0.05364028364419937,
"kl": 0.06444549560546875,
"learning_rate": 6.388888888888889e-07,
"loss": -0.2257,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.0180435199290514,
"mask/share_reasoning": 0.8248641490936279,
"mask/share_step_conf": 0.1180298775434494,
"num_tokens": 53482430.0,
"reward": 0.7916520833969116,
"reward_std": 0.22819365561008453,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7000319957733154,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.5793658494949341,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.6649892330169678,
"adv/mean_abs_reasoning": 0.45371317863464355,
"adv/mean_abs_step_conf": 0.7933188676834106,
"adv/ratio_final_to_reasoning": 1.4656599462641047,
"adv/ratio_step_to_reasoning": 1.7485030301979336,
"adv/std_final_conf": 0.8735648393630981,
"adv/std_reasoning": 0.7208279371261597,
"adv/std_step_conf": 0.9342332482337952,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.73158312870143,
"calib/avg_num_step_conf": 9.875,
"calib/ece": 0.20234817813765182,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6275303643724697,
"calib/gap": 0.21397804420049116,
"calib/mean_conf": 0.8513360323886638,
"calib/mu_c": 0.9258385093167703,
"calib/mu_w": 0.7118604651162791,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20093117408906885,
"calib/std_conf": 0.21808570084302475,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9033420707732635,
"calib/step_q_c_n": 1526.0,
"calib/step_q_gap": 0.036481791332145685,
"calib/step_q_w": 0.8668602794411178,
"calib/step_q_w_n": 1002.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2889.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 775.62109375,
"completions/mean_terminated_length": 797.4256591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 418.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.05140404403209686,
"kl": 0.067779541015625,
"learning_rate": 6.111111111111112e-07,
"loss": -0.0966,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.018296916037797928,
"mask/share_reasoning": 0.8330808877944946,
"mask/share_step_conf": 0.12127845734357834,
"num_tokens": 53787061.0,
"reward": 0.8549030423164368,
"reward_std": 0.24177849292755127,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.755286693572998,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.6357693076133728,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.6497317552566528,
"adv/mean_abs_reasoning": 0.45641210675239563,
"adv/mean_abs_step_conf": 0.7805640697479248,
"adv/ratio_final_to_reasoning": 1.4235638048250403,
"adv/ratio_step_to_reasoning": 1.7102177137719579,
"adv/std_final_conf": 0.8614363670349121,
"adv/std_reasoning": 0.7207245230674744,
"adv/std_step_conf": 0.9349228143692017,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.6598185117967332,
"calib/avg_num_step_conf": 9.15234375,
"calib/ece": 0.2562916666666666,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6333333333333333,
"calib/gap": 0.17090744101633404,
"calib/mean_conf": 0.8546250000000002,
"calib/mu_c": 0.9222758620689656,
"calib/mu_w": 0.7513684210526316,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2533749999999999,
"calib/std_conf": 0.24396179900754952,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8930182481751826,
"calib/step_q_c_n": 1370.0,
"calib/step_q_gap": 0.08982194807240762,
"calib/step_q_w": 0.803196300102775,
"calib/step_q_w_n": 973.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 804.73046875,
"completions/mean_terminated_length": 834.0526733398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 426.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.05745590478181839,
"kl": 0.06463623046875,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0987,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.017795901745557785,
"mask/share_reasoning": 0.8279945254325867,
"mask/share_step_conf": 0.11905330419540405,
"num_tokens": 54099336.0,
"reward": 0.7860234975814819,
"reward_std": 0.24726131558418274,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.675326943397522,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.5959386825561523,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.6072914004325867,
"adv/mean_abs_reasoning": 0.37272268533706665,
"adv/mean_abs_step_conf": 0.7750673294067383,
"adv/ratio_final_to_reasoning": 1.6293384447029056,
"adv/ratio_step_to_reasoning": 2.0794745259624237,
"adv/std_final_conf": 0.8151358366012573,
"adv/std_reasoning": 0.6614524722099304,
"adv/std_step_conf": 0.9343709349632263,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6556508967223253,
"calib/avg_num_step_conf": 9.953125,
"calib/ece": 0.18306122448979573,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5714285714285714,
"calib/gap": 0.18631493506493502,
"calib/mean_conf": 0.8098367346938776,
"calib/mu_c": 0.8683928571428572,
"calib/mu_w": 0.6820779220779222,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15359183673469368,
"calib/std_conf": 0.2748758047524688,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8753670647391578,
"calib/step_q_c_n": 1591.0,
"calib/step_q_gap": -0.004841921676725036,
"calib/step_q_w": 0.8802089864158829,
"calib/step_q_w_n": 957.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 868.38671875,
"completions/mean_terminated_length": 892.7991333007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 425.0,
"epoch": 0.192,
"grad_norm": 0.044392045587301254,
"kl": 0.0648193359375,
"learning_rate": 5.555555555555555e-07,
"loss": -0.046,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.016595961526036263,
"mask/share_reasoning": 0.845601499080658,
"mask/share_step_conf": 0.11045877635478973,
"num_tokens": 54425499.0,
"reward": 0.8599202632904053,
"reward_std": 0.2037505954504013,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7405816316604614,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.656602680683136,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.684099018573761,
"adv/mean_abs_reasoning": 0.6443017721176147,
"adv/mean_abs_step_conf": 0.7614454627037048,
"adv/ratio_final_to_reasoning": 1.061768022653958,
"adv/ratio_step_to_reasoning": 1.1818149439525458,
"adv/std_final_conf": 0.876131534576416,
"adv/std_reasoning": 0.8590683937072754,
"adv/std_step_conf": 0.9356979131698608,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6834362139917696,
"calib/avg_num_step_conf": 9.94921875,
"calib/ece": 0.2602880658436213,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5925925925925926,
"calib/gap": 0.18396296296296322,
"calib/mean_conf": 0.8064609053497943,
"calib/mu_c": 0.8882222222222224,
"calib/mu_w": 0.7042592592592591,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25559670781892996,
"calib/std_conf": 0.2715491026283736,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9053762711864407,
"calib/step_q_c_n": 1180.0,
"calib/step_q_gap": 0.04352550308110048,
"calib/step_q_w": 0.8618507681053402,
"calib/step_q_w_n": 1367.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2480.0,
"completions/max_terminated_length": 2480.0,
"completions/mean_length": 743.5078125,
"completions/mean_terminated_length": 776.8897705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 426.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.07317768782377243,
"kl": 0.07617950439453125,
"learning_rate": 5.277777777777779e-07,
"loss": -0.1169,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.01819986291229725,
"mask/share_reasoning": 0.8194365501403809,
"mask/share_step_conf": 0.11939479410648346,
"num_tokens": 54722101.0,
"reward": 0.7572405934333801,
"reward_std": 0.29411470890045166,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6693332195281982,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.5506167411804199,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.6801034212112427,
"adv/mean_abs_reasoning": 0.4527340531349182,
"adv/mean_abs_step_conf": 0.7628204822540283,
"adv/ratio_final_to_reasoning": 1.5022139741906417,
"adv/ratio_step_to_reasoning": 1.684919605609393,
"adv/std_final_conf": 0.8764859437942505,
"adv/std_reasoning": 0.739275336265564,
"adv/std_step_conf": 0.9347026348114014,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7248870581982461,
"calib/avg_num_step_conf": 9.30078125,
"calib/ece": 0.25870967741935486,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.592741935483871,
"calib/gap": 0.22101249003454704,
"calib/mean_conf": 0.8058870967741936,
"calib/mu_c": 0.9003521126760565,
"calib/mu_w": 0.6793396226415095,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24600806451612905,
"calib/std_conf": 0.278820997314019,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9006148867313915,
"calib/step_q_c_n": 1236.0,
"calib/step_q_gap": 0.06895986489733019,
"calib/step_q_w": 0.8316550218340613,
"calib/step_q_w_n": 1145.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2822.0,
"completions/max_terminated_length": 2822.0,
"completions/mean_length": 770.171875,
"completions/mean_terminated_length": 785.5139770507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 413.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.05326711758971214,
"kl": 0.0720672607421875,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0435,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.018472693860530853,
"mask/share_reasoning": 0.8421061038970947,
"mask/share_step_conf": 0.11988990753889084,
"num_tokens": 55025425.0,
"reward": 0.784070611000061,
"reward_std": 0.24252068996429443,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7084202766418457,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": 0.5550333261489868,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6686722040176392,
"adv/mean_abs_reasoning": 0.522902250289917,
"adv/mean_abs_step_conf": 0.7876584529876709,
"adv/ratio_final_to_reasoning": 1.2787709436073411,
"adv/ratio_step_to_reasoning": 1.5063206412880474,
"adv/std_final_conf": 0.873296320438385,
"adv/std_reasoning": 0.7756194472312927,
"adv/std_step_conf": 0.9346939921379089,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6828478964401294,
"calib/avg_num_step_conf": 9.8125,
"calib/ece": 0.21331950207468878,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.5020746887966805,
"calib/gap": 0.21650415083720298,
"calib/mean_conf": 0.7426141078838175,
"calib/mu_c": 0.835144927536232,
"calib/mu_w": 0.6186407766990291,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1916597510373444,
"calib/std_conf": 0.31346843708064276,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9056351236146633,
"calib/step_q_c_n": 1173.0,
"calib/step_q_gap": 0.06097492943990601,
"calib/step_q_w": 0.8446601941747572,
"calib/step_q_w_n": 1339.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2101.0,
"completions/max_terminated_length": 2101.0,
"completions/mean_length": 807.72265625,
"completions/mean_terminated_length": 850.9341430664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.1952,
"grad_norm": 0.04934949427843094,
"kl": 0.0679779052734375,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.2123,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.016763310879468918,
"mask/share_reasoning": 0.8293523788452148,
"mask/share_step_conf": 0.10310307145118713,
"num_tokens": 55338882.0,
"reward": 0.7845203280448914,
"reward_std": 0.2688310742378235,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6910668611526489,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": 0.5818799734115601,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.6820645332336426,
"adv/mean_abs_reasoning": 0.4836152195930481,
"adv/mean_abs_step_conf": 0.7581703662872314,
"adv/ratio_final_to_reasoning": 1.4103454680512026,
"adv/ratio_step_to_reasoning": 1.5677140329148773,
"adv/std_final_conf": 0.8702086806297302,
"adv/std_reasoning": 0.7394979596138,
"adv/std_step_conf": 0.9349130988121033,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.736623475609756,
"calib/avg_num_step_conf": 9.62109375,
"calib/ece": 0.17921487603305797,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.6074380165289256,
"calib/gap": 0.24499542682926845,
"calib/mean_conf": 0.829297520661157,
"calib/mu_c": 0.9123125000000002,
"calib/mu_w": 0.6673170731707317,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17367768595041333,
"calib/std_conf": 0.2624517317173887,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.9109391553328561,
"calib/step_q_c_n": 1397.0,
"calib/step_q_gap": 0.1102127638381718,
"calib/step_q_w": 0.8007263914946843,
"calib/step_q_w_n": 1066.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3012.0,
"completions/max_terminated_length": 3012.0,
"completions/mean_length": 809.125,
"completions/mean_terminated_length": 838.6072998046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.0646803006529808,
"kl": 0.06688690185546875,
"learning_rate": 4.444444444444445e-07,
"loss": -0.1008,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.017484432086348534,
"mask/share_reasoning": 0.8337000608444214,
"mask/share_step_conf": 0.11365921050310135,
"num_tokens": 55651298.0,
"reward": 0.8425024151802063,
"reward_std": 0.24076125025749207,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.7454652786254883,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": 0.6254770755767822,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.6692211627960205,
"adv/mean_abs_reasoning": 0.5714598298072815,
"adv/mean_abs_step_conf": 0.7615892887115479,
"adv/ratio_final_to_reasoning": 1.1710729746685922,
"adv/ratio_step_to_reasoning": 1.3327083532159827,
"adv/std_final_conf": 0.8775772452354431,
"adv/std_reasoning": 0.8268687129020691,
"adv/std_step_conf": 0.9354509711265564,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.698912288822032,
"calib/avg_num_step_conf": 10.3828125,
"calib/ece": 0.24258474576271183,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.1505754840700454,
"calib/mean_conf": 0.7552966101694916,
"calib/mu_c": 0.8108053691275169,
"calib/mu_w": 0.6602298850574715,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18326271186440674,
"calib/std_conf": 0.29865006474961586,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8650375375375375,
"calib/step_q_c_n": 1332.0,
"calib/step_q_gap": 0.04064085578791454,
"calib/step_q_w": 0.824396681749623,
"calib/step_q_w_n": 1326.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2938.0,
"completions/max_terminated_length": 2938.0,
"completions/mean_length": 782.56640625,
"completions/mean_terminated_length": 834.737548828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.06889063119888306,
"kl": 0.06777191162109375,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.2824,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.016981428489089012,
"mask/share_reasoning": 0.8096021413803101,
"mask/share_step_conf": 0.1109163910150528,
"num_tokens": 55958555.0,
"reward": 0.7719869017601013,
"reward_std": 0.2715288996696472,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6732874512672424,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": 0.5706863403320312,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.6912783980369568,
"adv/mean_abs_reasoning": 0.5170702338218689,
"adv/mean_abs_step_conf": 0.7700475454330444,
"adv/ratio_final_to_reasoning": 1.336913929327254,
"adv/ratio_step_to_reasoning": 1.4892513532278218,
"adv/std_final_conf": 0.9046430587768555,
"adv/std_reasoning": 0.7755802273750305,
"adv/std_step_conf": 0.9337618350982666,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7176522446765223,
"calib/avg_num_step_conf": 9.65625,
"calib/ece": 0.25356275303643727,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5101214574898786,
"calib/gap": 0.22800217008002177,
"calib/mean_conf": 0.7499190283400811,
"calib/mu_c": 0.8431506849315069,
"calib/mu_w": 0.6151485148514851,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20619433198380568,
"calib/std_conf": 0.3146426440322263,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8626199750312111,
"calib/step_q_c_n": 1335.0,
"calib/step_q_gap": 0.03157922451816508,
"calib/step_q_w": 0.831040750513046,
"calib/step_q_w_n": 1137.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2869.0,
"completions/max_terminated_length": 2869.0,
"completions/mean_length": 793.97265625,
"completions/mean_terminated_length": 816.2931518554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 371.0,
"epoch": 0.1984,
"grad_norm": 0.054222866892814636,
"kl": 0.0751190185546875,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.1349,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.018149716779589653,
"mask/share_reasoning": 0.8339352607727051,
"mask/share_step_conf": 0.12057129293680191,
"num_tokens": 56266852.0,
"reward": 0.8087828755378723,
"reward_std": 0.257120281457901,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7117167711257935,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6003800630569458,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.7086143493652344,
"adv/mean_abs_reasoning": 0.4930305480957031,
"adv/mean_abs_step_conf": 0.775668203830719,
"adv/ratio_final_to_reasoning": 1.4372625633486789,
"adv/ratio_step_to_reasoning": 1.5732660112576888,
"adv/std_final_conf": 0.8794147372245789,
"adv/std_reasoning": 0.7395173907279968,
"adv/std_step_conf": 0.9348717331886292,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6558219178082191,
"calib/avg_num_step_conf": 10.26953125,
"calib/ece": 0.2400819672131147,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.4713114754098361,
"calib/gap": 0.14184232597148483,
"calib/mean_conf": 0.7438524590163934,
"calib/mu_c": 0.8008219178082193,
"calib/mu_w": 0.6589795918367345,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1927868852459016,
"calib/std_conf": 0.28849509676008184,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8616146230007617,
"calib/step_q_c_n": 1313.0,
"calib/step_q_gap": 0.14515945582750944,
"calib/step_q_w": 0.7164551671732523,
"calib/step_q_w_n": 1316.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2859.0,
"completions/max_terminated_length": 2859.0,
"completions/mean_length": 860.22265625,
"completions/mean_terminated_length": 877.3585815429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 451.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.05153043195605278,
"kl": 0.0677947998046875,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0226,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.0167331974953413,
"mask/share_reasoning": 0.8521455526351929,
"mask/share_step_conf": 0.11158999800682068,
"num_tokens": 56588613.0,
"reward": 0.7962475419044495,
"reward_std": 0.2418176233768463,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6870421171188354,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": 0.6015465259552002,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.728238046169281,
"adv/mean_abs_reasoning": 0.46052050590515137,
"adv/mean_abs_step_conf": 0.7711305022239685,
"adv/ratio_final_to_reasoning": 1.5813368500017861,
"adv/ratio_step_to_reasoning": 1.6744759295969118,
"adv/std_final_conf": 0.9081566333770752,
"adv/std_reasoning": 0.7394147515296936,
"adv/std_step_conf": 0.9348183274269104,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6867738399707709,
"calib/avg_num_step_conf": 9.64453125,
"calib/ece": 0.1543089430894308,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.483739837398374,
"calib/gap": 0.2231348191450493,
"calib/mean_conf": 0.7454471544715447,
"calib/mu_c": 0.8225465838509317,
"calib/mu_w": 0.5994117647058824,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.12264227642276412,
"calib/std_conf": 0.30079367109898403,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8945784381478922,
"calib/step_q_c_n": 1447.0,
"calib/step_q_gap": 0.05269879039838132,
"calib/step_q_w": 0.8418796477495109,
"calib/step_q_w_n": 1022.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2262.0,
"completions/max_terminated_length": 2262.0,
"completions/mean_length": 820.75,
"completions/mean_terminated_length": 843.8232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 342.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.10698122531175613,
"kl": 0.06858062744140625,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.1138,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.017628012225031853,
"mask/share_reasoning": 0.8367440104484558,
"mask/share_step_conf": 0.11828421801328659,
"num_tokens": 56902797.0,
"reward": 0.8345286846160889,
"reward_std": 0.2370331734418869,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7457132935523987,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": 0.605375349521637,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.6068315505981445,
"adv/mean_abs_reasoning": 0.39968180656433105,
"adv/mean_abs_step_conf": 0.7506299018859863,
"adv/ratio_final_to_reasoning": 1.5182866486079885,
"adv/ratio_step_to_reasoning": 1.8780687275670833,
"adv/std_final_conf": 0.8442689776420593,
"adv/std_reasoning": 0.7012975215911865,
"adv/std_step_conf": 0.9340417981147766,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7970951343500363,
"calib/avg_num_step_conf": 9.78515625,
"calib/ece": 0.10344129554655862,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4048582995951417,
"calib/gap": 0.3640530137981117,
"calib/mean_conf": 0.6695951417004048,
"calib/mu_c": 0.7948765432098766,
"calib/mu_w": 0.4308235294117648,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.05858299595141691,
"calib/std_conf": 0.32990956627493656,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8672979166666668,
"calib/step_q_c_n": 1440.0,
"calib/step_q_gap": 0.03278617957746488,
"calib/step_q_w": 0.8345117370892019,
"calib/step_q_w_n": 1065.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2958.0,
"completions/max_terminated_length": 2958.0,
"completions/mean_length": 811.11328125,
"completions/mean_terminated_length": 823.9881591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.2016,
"grad_norm": 0.07192602753639221,
"kl": 0.074310302734375,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0406,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.018070422112941742,
"mask/share_reasoning": 0.8476791381835938,
"mask/share_step_conf": 0.1186254620552063,
"num_tokens": 57218210.0,
"reward": 0.8747564554214478,
"reward_std": 0.18241506814956665,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.800437867641449,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": 0.6295437812805176,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.6713325381278992,
"adv/mean_abs_reasoning": 0.45265400409698486,
"adv/mean_abs_step_conf": 0.7748057842254639,
"adv/ratio_final_to_reasoning": 1.4831030589625815,
"adv/ratio_step_to_reasoning": 1.7116954168364218,
"adv/std_final_conf": 0.8905377388000488,
"adv/std_reasoning": 0.7206981778144836,
"adv/std_step_conf": 0.934777557849884,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6814162422184493,
"calib/avg_num_step_conf": 9.87890625,
"calib/ece": 0.16971428571428568,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4448979591836735,
"calib/gap": 0.20692062818336154,
"calib/mean_conf": 0.7190204081632652,
"calib/mu_c": 0.7975657894736843,
"calib/mu_w": 0.5906451612903227,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1341632653061224,
"calib/std_conf": 0.311720471969249,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8788965093411997,
"calib/step_q_c_n": 1356.0,
"calib/step_q_gap": 0.049893951796442626,
"calib/step_q_w": 0.829002557544757,
"calib/step_q_w_n": 1173.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2626.0,
"completions/max_terminated_length": 2626.0,
"completions/mean_length": 827.65625,
"completions/mean_terminated_length": 850.9236450195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.04944543167948723,
"kl": 0.07180023193359375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.1137,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.017103388905525208,
"mask/share_reasoning": 0.8414227962493896,
"mask/share_step_conf": 0.11413010209798813,
"num_tokens": 57535698.0,
"reward": 0.8249231576919556,
"reward_std": 0.22116057574748993,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7226202487945557,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6170697212219238,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.690582811832428,
"adv/mean_abs_reasoning": 0.426123708486557,
"adv/mean_abs_step_conf": 0.7383967638015747,
"adv/ratio_final_to_reasoning": 1.6206157932050709,
"adv/ratio_step_to_reasoning": 1.7328225327431388,
"adv/std_final_conf": 0.8894121646881104,
"adv/std_reasoning": 0.7014713287353516,
"adv/std_step_conf": 0.935016930103302,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7255667164381703,
"calib/avg_num_step_conf": 10.13671875,
"calib/ece": 0.2516326530612245,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.6081632653061224,
"calib/gap": 0.25055314239174675,
"calib/mean_conf": 0.7822448979591836,
"calib/mu_c": 0.8906474820143884,
"calib/mu_w": 0.6400943396226416,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.233265306122449,
"calib/std_conf": 0.3042962124689507,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.9067540983606558,
"calib/step_q_c_n": 1220.0,
"calib/step_q_gap": 0.06829591654247391,
"calib/step_q_w": 0.8384581818181819,
"calib/step_q_w_n": 1375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 1977.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 755.5546875,
"completions/mean_terminated_length": 789.4775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 409.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.0591316819190979,
"kl": 0.071075439453125,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.2139,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01827659085392952,
"mask/share_reasoning": 0.8171859979629517,
"mask/share_step_conf": 0.1215687245130539,
"num_tokens": 57833288.0,
"reward": 0.7744136452674866,
"reward_std": 0.23828265070915222,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7037370800971985,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.545871376991272,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.7076407074928284,
"adv/mean_abs_reasoning": 0.43131551146507263,
"adv/mean_abs_step_conf": 0.7656826376914978,
"adv/ratio_final_to_reasoning": 1.6406567551654867,
"adv/ratio_step_to_reasoning": 1.775226295689349,
"adv/std_final_conf": 0.890299916267395,
"adv/std_reasoning": 0.7208864688873291,
"adv/std_step_conf": 0.9349159002304077,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7080642557313529,
"calib/avg_num_step_conf": 9.7890625,
"calib/ece": 0.14920502092050214,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5481171548117155,
"calib/gap": 0.21411365837907648,
"calib/mean_conf": 0.7823430962343098,
"calib/mu_c": 0.8504294478527608,
"calib/mu_w": 0.6363157894736843,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.1247698744769875,
"calib/std_conf": 0.28081089890896666,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8886009421265142,
"calib/step_q_c_n": 1486.0,
"calib/step_q_gap": 0.15244407938141602,
"calib/step_q_w": 0.7361568627450982,
"calib/step_q_w_n": 1020.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2928.0,
"completions/max_terminated_length": 2928.0,
"completions/mean_length": 780.51171875,
"completions/mean_terminated_length": 818.8975219726562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.2048,
"grad_norm": 0.04957636818289757,
"kl": 0.07782745361328125,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.2414,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.018003789708018303,
"mask/share_reasoning": 0.8114763498306274,
"mask/share_step_conf": 0.1236448884010315,
"num_tokens": 58138075.0,
"reward": 0.8275803923606873,
"reward_std": 0.2583814859390259,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7309035062789917,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l1_reward": 0.6109760403633118,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.7112095355987549,
"adv/mean_abs_reasoning": 0.4996627867221832,
"adv/mean_abs_step_conf": 0.7590423226356506,
"adv/ratio_final_to_reasoning": 1.4233790358179974,
"adv/ratio_step_to_reasoning": 1.5191091728383699,
"adv/std_final_conf": 0.8913341760635376,
"adv/std_reasoning": 0.7754184007644653,
"adv/std_step_conf": 0.9352988004684448,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6429493713897384,
"calib/avg_num_step_conf": 9.7421875,
"calib/ece": 0.23543032786885248,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4918032786885246,
"calib/gap": 0.16506931702344563,
"calib/mean_conf": 0.7448155737704918,
"calib/mu_c": 0.8185555555555556,
"calib/mu_w": 0.65348623853211,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21348360655737708,
"calib/std_conf": 0.30274540716703924,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8925401459854014,
"calib/step_q_c_n": 1233.0,
"calib/step_q_gap": 0.09319438864995322,
"calib/step_q_w": 0.7993457573354482,
"calib/step_q_w_n": 1261.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2211.0,
"completions/max_terminated_length": 2211.0,
"completions/mean_length": 782.25,
"completions/mean_terminated_length": 817.3713989257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 419.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.07504615932703018,
"kl": 0.072113037109375,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.1529,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.017368696630001068,
"mask/share_reasoning": 0.827225923538208,
"mask/share_step_conf": 0.11243665218353271,
"num_tokens": 58444043.0,
"reward": 0.7569782137870789,
"reward_std": 0.22785191237926483,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6729967594146729,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.5448658466339111,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.5718756914138794,
"adv/mean_abs_reasoning": 0.4823354482650757,
"adv/mean_abs_step_conf": 0.7542473673820496,
"adv/ratio_final_to_reasoning": 1.185638943749362,
"adv/ratio_step_to_reasoning": 1.563740276803251,
"adv/std_final_conf": 0.831807017326355,
"adv/std_reasoning": 0.7577318549156189,
"adv/std_step_conf": 0.9345336556434631,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7718676122931442,
"calib/avg_num_step_conf": 9.95703125,
"calib/ece": 0.1869166666666666,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6125,
"calib/gap": 0.3344895766172362,
"calib/mean_conf": 0.7669166666666667,
"calib/mu_c": 0.9048936170212766,
"calib/mu_w": 0.5704040404040404,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.18316666666666662,
"calib/std_conf": 0.319433863351329,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8862143432715551,
"calib/step_q_c_n": 1241.0,
"calib/step_q_gap": 0.04246816590152458,
"calib/step_q_w": 0.8437461773700305,
"calib/step_q_w_n": 1308.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 753.66796875,
"completions/mean_terminated_length": 793.9876098632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 440.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.051577258855104446,
"kl": 0.0688323974609375,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.2472,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.018072867766022682,
"mask/share_reasoning": 0.816690981388092,
"mask/share_step_conf": 0.11445486545562744,
"num_tokens": 58742926.0,
"reward": 0.8116989135742188,
"reward_std": 0.2461298704147339,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7364538908004761,
"rewards/format_reward_step": 0.9375,
"rewards/step_l1_reward": 0.5892876386642456,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.7048993110656738,
"adv/mean_abs_reasoning": 0.4602048397064209,
"adv/mean_abs_step_conf": 0.7671737670898438,
"adv/ratio_final_to_reasoning": 1.5317077315296188,
"adv/ratio_step_to_reasoning": 1.6670267257059876,
"adv/std_final_conf": 0.8785596489906311,
"adv/std_reasoning": 0.7207231521606445,
"adv/std_step_conf": 0.934549868106842,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6614285714285715,
"calib/avg_num_step_conf": 9.59375,
"calib/ece": 0.25583673469387763,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5306122448979592,
"calib/gap": 0.16304761904761922,
"calib/mean_conf": 0.7369795918367347,
"calib/mu_c": 0.8068571428571429,
"calib/mu_w": 0.6438095238095237,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21069387755102048,
"calib/std_conf": 0.32297699986322514,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8810873142250532,
"calib/step_q_c_n": 1256.0,
"calib/step_q_gap": 0.02071314755838638,
"calib/step_q_w": 0.8603741666666668,
"calib/step_q_w_n": 1200.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 1819.0,
"completions/max_terminated_length": 1819.0,
"completions/mean_length": 786.08984375,
"completions/mean_terminated_length": 814.7327880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.208,
"grad_norm": 0.0542498379945755,
"kl": 0.07525634765625,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.179,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01755117066204548,
"mask/share_reasoning": 0.8345537185668945,
"mask/share_step_conf": 0.11273886263370514,
"num_tokens": 59050149.0,
"reward": 0.7701911926269531,
"reward_std": 0.21878653764724731,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6730234622955322,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.5665776133537292,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.4969114065170288,
"adv/mean_abs_reasoning": 0.30748242139816284,
"adv/mean_abs_step_conf": 0.7540386915206909,
"adv/ratio_final_to_reasoning": 1.6160644379522822,
"adv/ratio_step_to_reasoning": 2.4522985349600743,
"adv/std_final_conf": 0.7552186846733093,
"adv/std_reasoning": 0.6184966564178467,
"adv/std_step_conf": 0.932518482208252,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7099794941900205,
"calib/avg_num_step_conf": 9.1015625,
"calib/ece": 0.2502008032128514,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7028112449799196,
"calib/gap": 0.20612098427887915,
"calib/mean_conf": 0.8511646586345383,
"calib/mu_c": 0.9298051948051947,
"calib/mu_w": 0.7236842105263156,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24144578313253012,
"calib/std_conf": 0.25713092823111716,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.904220788530466,
"calib/step_q_c_n": 1395.0,
"calib/step_q_gap": 0.06034913077645532,
"calib/step_q_w": 0.8438716577540106,
"calib/step_q_w_n": 935.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2028.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 727.0546875,
"completions/mean_terminated_length": 747.4939575195312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 460.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.04341113567352295,
"kl": 0.06845855712890625,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0891,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.019082296639680862,
"mask/share_reasoning": 0.8294082283973694,
"mask/share_step_conf": 0.12416577339172363,
"num_tokens": 59338819.0,
"reward": 0.8234367370605469,
"reward_std": 0.16358217597007751,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.720785915851593,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": 0.6112438440322876,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.6498538255691528,
"adv/mean_abs_reasoning": 0.533227801322937,
"adv/mean_abs_step_conf": 0.7692816257476807,
"adv/ratio_final_to_reasoning": 1.218717073560056,
"adv/ratio_step_to_reasoning": 1.4426885166885421,
"adv/std_final_conf": 0.8588353395462036,
"adv/std_reasoning": 0.7929235696792603,
"adv/std_step_conf": 0.9352068901062012,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7199796126401631,
"calib/avg_num_step_conf": 9.53515625,
"calib/ece": 0.22280737704918027,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4385245901639344,
"calib/gap": 0.20918008834522617,
"calib/mean_conf": 0.7551844262295082,
"calib/mu_c": 0.8486296296296298,
"calib/mu_w": 0.6394495412844037,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21235655737704914,
"calib/std_conf": 0.2742819739481024,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8934718826405867,
"calib/step_q_c_n": 1227.0,
"calib/step_q_gap": 0.08670499631439232,
"calib/step_q_w": 0.8067668863261944,
"calib/step_q_w_n": 1214.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 1723.0,
"completions/max_terminated_length": 1723.0,
"completions/mean_length": 766.0390625,
"completions/mean_terminated_length": 803.7130737304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 468.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.0639037936925888,
"kl": 0.075775146484375,
"learning_rate": 8.333333333333334e-08,
"loss": -0.2186,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.017740493640303612,
"mask/share_reasoning": 0.8179367780685425,
"mask/share_step_conf": 0.11744774132966995,
"num_tokens": 59639981.0,
"reward": 0.7840213775634766,
"reward_std": 0.22477491199970245,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7055456042289734,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": 0.5664034485816956,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.6817752718925476,
"adv/mean_abs_reasoning": 0.5082120895385742,
"adv/mean_abs_step_conf": 0.7704319953918457,
"adv/ratio_final_to_reasoning": 1.3415172246523261,
"adv/ratio_step_to_reasoning": 1.5159655019056144,
"adv/std_final_conf": 0.8869988322257996,
"adv/std_reasoning": 0.7577569484710693,
"adv/std_step_conf": 0.9344480037689209,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.78292589763178,
"calib/avg_num_step_conf": 9.609375,
"calib/ece": 0.15046025104602515,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.5271966527196653,
"calib/gap": 0.29537280366692154,
"calib/mean_conf": 0.7789121338912134,
"calib/mu_c": 0.8839610389610392,
"calib/mu_w": 0.5885882352941176,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.14251046025104605,
"calib/std_conf": 0.2863236688183009,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.8935914985590778,
"calib/step_q_c_n": 1388.0,
"calib/step_q_gap": 0.04145530452922708,
"calib/step_q_w": 0.8521361940298507,
"calib/step_q_w_n": 1072.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2894.0,
"completions/max_terminated_length": 2894.0,
"completions/mean_length": 734.08203125,
"completions/mean_terminated_length": 783.0208740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 383.0,
"epoch": 0.2112,
"grad_norm": 0.06836410611867905,
"kl": 0.0736541748046875,
"learning_rate": 5.555555555555556e-08,
"loss": -0.2358,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.018041130155324936,
"mask/share_reasoning": 0.7997227311134338,
"mask/share_step_conf": 0.11973617970943451,
"num_tokens": 59933290.0,
"reward": 0.8341923952102661,
"reward_std": 0.22902759909629822,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7511105537414551,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l1_reward": 0.6118054389953613,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.6972668170928955,
"adv/mean_abs_reasoning": 0.6235437393188477,
"adv/mean_abs_step_conf": 0.7461246252059937,
"adv/ratio_final_to_reasoning": 1.118232407969619,
"adv/ratio_step_to_reasoning": 1.1965874695832117,
"adv/std_final_conf": 0.8916330337524414,
"adv/std_reasoning": 0.8591709136962891,
"adv/std_step_conf": 0.9358724355697632,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.6319540229885057,
"calib/avg_num_step_conf": 9.6484375,
"calib/ece": 0.2516595744680851,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.13508045977011496,
"calib/mean_conf": 0.7966808510638297,
"calib/mu_c": 0.8484137931034483,
"calib/mu_w": 0.7133333333333334,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2156595744680851,
"calib/std_conf": 0.28745553634953547,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8878342455043002,
"calib/step_q_c_n": 1279.0,
"calib/step_q_gap": 0.04233886347239424,
"calib/step_q_w": 0.845495382031906,
"calib/step_q_w_n": 1191.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 781.07421875,
"completions/mean_terminated_length": 833.1458740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.05004898086190224,
"kl": 0.06656646728515625,
"learning_rate": 2.777777777777778e-08,
"loss": -0.2956,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.017658531665802002,
"mask/share_reasoning": 0.8055081367492676,
"mask/share_step_conf": 0.11433329433202744,
"num_tokens": 60237445.0,
"reward": 0.7555822134017944,
"reward_std": 0.3044376075267792,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6541687250137329,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l1_reward": 0.560120701789856,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.6082898378372192,
"adv/mean_abs_reasoning": 0.4397979974746704,
"adv/mean_abs_step_conf": 0.7207082509994507,
"adv/ratio_final_to_reasoning": 1.3831118862069236,
"adv/ratio_step_to_reasoning": 1.6387256311710672,
"adv/std_final_conf": 0.8417679071426392,
"adv/std_reasoning": 0.7392775416374207,
"adv/std_step_conf": 0.9338216781616211,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7269190483315876,
"calib/avg_num_step_conf": 9.04296875,
"calib/ece": 0.1732244897959183,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5959183673469388,
"calib/gap": 0.2493947329043843,
"calib/mean_conf": 0.7872653061224489,
"calib/mu_c": 0.8707361963190184,
"calib/mu_w": 0.6213414634146341,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.1475918367346938,
"calib/std_conf": 0.2898772981452813,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8987449617790131,
"calib/step_q_c_n": 1439.0,
"calib/step_q_gap": 0.036130806527871484,
"calib/step_q_w": 0.8626141552511416,
"calib/step_q_w_n": 876.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 3028.0,
"completions/max_terminated_length": 3028.0,
"completions/mean_length": 806.62109375,
"completions/mean_terminated_length": 836.0121459960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 469.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.045651182532310486,
"kl": 0.06322479248046875,
"learning_rate": 0.0,
"loss": -0.1477,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.01771700009703636,
"mask/share_reasoning": 0.8353173732757568,
"mask/share_step_conf": 0.11180936545133591,
"num_tokens": 60551988.0,
"reward": 0.8539600372314453,
"reward_std": 0.1936844438314438,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.755567193031311,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": 0.6336028575897217,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.1237042972794734,
"train_runtime": 21618.204,
"train_samples_per_second": 2.368,
"train_steps_per_second": 0.009
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 60551988,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}