Files
PureRL-1.5B-v7-s2-l1-maskon/trainer_state.json
ModelHub XC facef0725d 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l1-maskon
Source: Original Platform
2026-05-31 00:44:22 +08:00

11684 lines
467 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7603062391281128,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5934460932450047,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9350208044052124,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04540952667593956,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": 0.0801,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 0.3269073963165283,
"reward_std": 0.18100249767303467,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3909340500831604,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.7713351249694824,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.5110743681990095,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9351165890693665,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.03894897922873497,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0068,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 0.25607970356941223,
"reward_std": 0.1939290463924408,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l1_reward": -0.4261667728424072,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7773253917694092,
"adv/mean_abs_reasoning": 0.4774738848209381,
"adv/mean_abs_step_conf": 0.7686185836791992,
"adv/ratio_final_to_reasoning": 1.6279956171025378,
"adv/ratio_step_to_reasoning": 1.609760466726774,
"adv/std_final_conf": 0.9283453822135925,
"adv/std_reasoning": 0.7393098473548889,
"adv/std_step_conf": 0.93489009141922,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.583791928721174,
"calib/avg_num_step_conf": 5.0078125,
"calib/ece": 0.2604313725490196,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.3215686274509804,
"calib/gap": 0.014355345911949535,
"calib/mean_conf": 0.8814509803921569,
"calib/mu_c": 0.8868553459119496,
"calib/mu_w": 0.8725,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2591764705882353,
"calib/std_conf": 0.046541701193867246,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7877590673575129,
"calib/step_q_c_n": 772.0,
"calib/step_q_gap": 0.018406126181042315,
"calib/step_q_w": 0.7693529411764706,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2468.0,
"completions/max_terminated_length": 2468.0,
"completions/mean_length": 505.6015625,
"completions/mean_terminated_length": 505.6015625,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.060042351484298706,
"kl": 0.0027255117893218994,
"learning_rate": 7.5e-07,
"loss": 0.0702,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03299238905310631,
"mask/share_reasoning": 0.8572773933410645,
"mask/share_step_conf": 0.10973025858402252,
"num_tokens": 693351.0,
"reward": 0.3012976050376892,
"reward_std": 0.18565943837165833,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6995797157287598,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.41885948181152344,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.762697160243988,
"adv/mean_abs_reasoning": 0.3705032169818878,
"adv/mean_abs_step_conf": 0.7731007933616638,
"adv/ratio_final_to_reasoning": 2.0585439620657136,
"adv/ratio_step_to_reasoning": 2.0866236996788534,
"adv/std_final_conf": 0.9276609420776367,
"adv/std_reasoning": 0.6612005233764648,
"adv/std_step_conf": 0.9348263740539551,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5453267973856208,
"calib/avg_num_step_conf": 5.1640625,
"calib/ece": 0.27134387351778666,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.25296442687747034,
"calib/gap": 0.008577124183006268,
"calib/mean_conf": 0.8760869565217392,
"calib/mu_c": 0.8794771241830065,
"calib/mu_w": 0.8709000000000002,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.27134387351778666,
"calib/std_conf": 0.04777359677934313,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7998416886543536,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.032412610640169204,
"calib/step_q_w": 0.7674290780141844,
"calib/step_q_w_n": 564.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2156.0,
"completions/max_terminated_length": 2156.0,
"completions/mean_length": 505.703125,
"completions/mean_terminated_length": 507.6863098144531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.04558708891272545,
"kl": 0.0002855062484741211,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0077,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03333834931254387,
"mask/share_reasoning": 0.847872793674469,
"mask/share_step_conf": 0.11488261073827744,
"num_tokens": 928979.0,
"reward": 0.28353989124298096,
"reward_std": 0.15577252209186554,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6810855865478516,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.4311932921409607,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7652193307876587,
"adv/mean_abs_reasoning": 0.4220971465110779,
"adv/mean_abs_step_conf": 0.7900995016098022,
"adv/ratio_final_to_reasoning": 1.8128986114043195,
"adv/ratio_step_to_reasoning": 1.8718427929222359,
"adv/std_final_conf": 0.9303828477859497,
"adv/std_reasoning": 0.7013581991195679,
"adv/std_step_conf": 0.9348316192626953,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.40712190463196674,
"calib/avg_num_step_conf": 4.57421875,
"calib/ece": 0.3155327868852459,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.2827868852459016,
"calib/gap": -0.014150351320008192,
"calib/mean_conf": 0.8770081967213115,
"calib/mu_c": 0.8708029197080291,
"calib/mu_w": 0.8849532710280373,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3155327868852459,
"calib/std_conf": 0.04733539137977054,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7995927601809957,
"calib/step_q_c_n": 663.0,
"calib/step_q_gap": 0.013096697188869522,
"calib/step_q_w": 0.7864960629921262,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2997.0,
"completions/max_terminated_length": 2997.0,
"completions/mean_length": 518.48046875,
"completions/mean_terminated_length": 524.6284790039062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.04099205508828163,
"kl": 0.0003192722797393799,
"learning_rate": 1.25e-06,
"loss": 0.0557,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03335411101579666,
"mask/share_reasoning": 0.8477334976196289,
"mask/share_step_conf": 0.10719365626573563,
"num_tokens": 1168398.0,
"reward": 0.24281755089759827,
"reward_std": 0.16069456934928894,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6147746443748474,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.42679572105407715,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.781194806098938,
"adv/mean_abs_reasoning": 0.3777908384799957,
"adv/mean_abs_step_conf": 0.7725051641464233,
"adv/ratio_final_to_reasoning": 2.0677971155732586,
"adv/ratio_step_to_reasoning": 2.04479591737725,
"adv/std_final_conf": 0.9307109117507935,
"adv/std_reasoning": 0.6403254866600037,
"adv/std_step_conf": 0.9347091317176819,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5274223602484474,
"calib/avg_num_step_conf": 5.0234375,
"calib/ece": 0.3353725490196079,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.2784313725490196,
"calib/gap": 0.003583850931676902,
"calib/mean_conf": 0.8810980392156863,
"calib/mu_c": 0.8827142857142857,
"calib/mu_w": 0.8791304347826088,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3337254901960785,
"calib/std_conf": 0.04119263921584887,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7980880121396056,
"calib/step_q_c_n": 659.0,
"calib/step_q_gap": 0.007609543240084005,
"calib/step_q_w": 0.7904784688995216,
"calib/step_q_w_n": 627.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1241.0,
"completions/max_terminated_length": 1241.0,
"completions/mean_length": 427.6875,
"completions/mean_terminated_length": 429.36474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.0064,
"grad_norm": 1.3526962995529175,
"kl": 0.9144148826599121,
"learning_rate": 1.5e-06,
"loss": 0.0894,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.037748031318187714,
"mask/share_reasoning": 0.8306044340133667,
"mask/share_step_conf": 0.12774130702018738,
"num_tokens": 1383838.0,
"reward": 0.24301785230636597,
"reward_std": 0.16260243952274323,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6358148455619812,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.4575916826725006,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7746590375900269,
"adv/mean_abs_reasoning": 0.45679908990859985,
"adv/mean_abs_step_conf": 0.7530906200408936,
"adv/ratio_final_to_reasoning": 1.6958419022792428,
"adv/ratio_step_to_reasoning": 1.6486254825761106,
"adv/std_final_conf": 0.9307973980903625,
"adv/std_reasoning": 0.7205731272697449,
"adv/std_step_conf": 0.9347212910652161,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4665942769545222,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.2958823529411765,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3058823529411765,
"calib/gap": -0.009091722023505722,
"calib/mean_conf": 0.8838039215686274,
"calib/mu_c": 0.8801315789473683,
"calib/mu_w": 0.889223300970874,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.29180392156862744,
"calib/std_conf": 0.044914306034737804,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7973768472906403,
"calib/step_q_c_n": 812.0,
"calib/step_q_gap": 0.003001847290640236,
"calib/step_q_w": 0.794375,
"calib/step_q_w_n": 608.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2503.0,
"completions/max_terminated_length": 2503.0,
"completions/mean_length": 550.87890625,
"completions/mean_terminated_length": 553.0392456054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.03561462089419365,
"kl": 0.0003936886787414551,
"learning_rate": 1.75e-06,
"loss": 0.1121,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.030144359916448593,
"mask/share_reasoning": 0.8576929569244385,
"mask/share_step_conf": 0.10825636237859726,
"num_tokens": 1632287.0,
"reward": 0.2793276011943817,
"reward_std": 0.17717917263507843,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6590094566345215,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.41597920656204224,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.752838671207428,
"adv/mean_abs_reasoning": 0.4558185935020447,
"adv/mean_abs_step_conf": 0.7695530652999878,
"adv/ratio_final_to_reasoning": 1.6516190474446957,
"adv/ratio_step_to_reasoning": 1.6882880081471179,
"adv/std_final_conf": 0.9314465522766113,
"adv/std_reasoning": 0.7391924858093262,
"adv/std_step_conf": 0.9353874325752258,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.44841897233201583,
"calib/avg_num_step_conf": 4.8828125,
"calib/ece": 0.3188306451612902,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.2661290322580645,
"calib/gap": -0.0006363636363636571,
"calib/mean_conf": 0.8752822580645161,
"calib/mu_c": 0.875,
"calib/mu_w": 0.8756363636363637,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3188306451612902,
"calib/std_conf": 0.062132151108365445,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8010367892976589,
"calib/step_q_c_n": 598.0,
"calib/step_q_gap": 0.04812267886821109,
"calib/step_q_w": 0.7529141104294478,
"calib/step_q_w_n": 652.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2669.0,
"completions/max_terminated_length": 2669.0,
"completions/mean_length": 549.87890625,
"completions/mean_terminated_length": 552.0353393554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.041965585201978683,
"kl": 0.00045447051525115967,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.068,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03198568522930145,
"mask/share_reasoning": 0.8615812063217163,
"mask/share_step_conf": 0.10252687335014343,
"num_tokens": 1879568.0,
"reward": 0.2684253752231598,
"reward_std": 0.17151033878326416,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.627129316329956,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3918410539627075,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.8019947409629822,
"adv/mean_abs_reasoning": 0.4327741265296936,
"adv/mean_abs_step_conf": 0.7716740369796753,
"adv/ratio_final_to_reasoning": 1.8531485405423274,
"adv/ratio_step_to_reasoning": 1.7830872727247686,
"adv/std_final_conf": 0.9300606846809387,
"adv/std_reasoning": 0.6817935109138489,
"adv/std_step_conf": 0.935168981552124,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4441333333333333,
"calib/avg_num_step_conf": 4.87890625,
"calib/ece": 0.2805599999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.24,
"calib/gap": -0.004000000000000226,
"calib/mean_conf": 0.8767999999999999,
"calib/mu_c": 0.8752,
"calib/mu_w": 0.8792000000000002,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.27867999999999993,
"calib/std_conf": 0.04814311996536993,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.7803221083455345,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.07846698467062285,
"calib/step_q_w": 0.7018551236749117,
"calib/step_q_w_n": 566.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2870.0,
"completions/max_terminated_length": 2870.0,
"completions/mean_length": 475.75,
"completions/mean_terminated_length": 481.3913269042969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.0096,
"grad_norm": 0.03932953625917435,
"kl": 0.00033918023109436035,
"learning_rate": 2.25e-06,
"loss": 0.0134,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0339675210416317,
"mask/share_reasoning": 0.8485729694366455,
"mask/share_step_conf": 0.10574081540107727,
"num_tokens": 2108896.0,
"reward": 0.2592368423938751,
"reward_std": 0.19198694825172424,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.651790976524353,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.44347357749938965,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7341861128807068,
"adv/mean_abs_reasoning": 0.43882063031196594,
"adv/mean_abs_step_conf": 0.7601568698883057,
"adv/ratio_final_to_reasoning": 1.6730893266316123,
"adv/ratio_step_to_reasoning": 1.732272407858071,
"adv/std_final_conf": 0.9312600493431091,
"adv/std_reasoning": 0.7391703724861145,
"adv/std_step_conf": 0.9348811507225037,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4770261437908497,
"calib/avg_num_step_conf": 4.9921875,
"calib/ece": 0.285952380952381,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3055555555555556,
"calib/gap": 0.004470588235293782,
"calib/mean_conf": 0.8811904761904763,
"calib/mu_c": 0.8829999999999998,
"calib/mu_w": 0.878529411764706,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.285952380952381,
"calib/std_conf": 0.07005788018462818,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7861420612813369,
"calib/step_q_c_n": 718.0,
"calib/step_q_gap": 0.003445632709908275,
"calib/step_q_w": 0.7826964285714286,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 2653.0,
"completions/mean_length": 534.4375,
"completions/mean_terminated_length": 534.4375,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.2811925411224365,
"kl": 0.48865118622779846,
"learning_rate": 2.5e-06,
"loss": 0.1109,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03223879635334015,
"mask/share_reasoning": 0.8597942590713501,
"mask/share_step_conf": 0.10796696692705154,
"num_tokens": 2352512.0,
"reward": 0.2733853757381439,
"reward_std": 0.18411020934581757,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6640077829360962,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.4312995672225952,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7714605331420898,
"adv/mean_abs_reasoning": 0.3798549175262451,
"adv/mean_abs_step_conf": 0.7588397264480591,
"adv/ratio_final_to_reasoning": 2.0309347004538587,
"adv/ratio_step_to_reasoning": 1.9977093659597784,
"adv/std_final_conf": 0.9286066293716431,
"adv/std_reasoning": 0.6613515615463257,
"adv/std_step_conf": 0.9348770380020142,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.3845270890725436,
"calib/avg_num_step_conf": 5.2890625,
"calib/ece": 0.30011857707509887,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.31620553359683795,
"calib/gap": -0.03282106782106786,
"calib/mean_conf": 0.8771936758893281,
"calib/mu_c": 0.8643506493506494,
"calib/mu_w": 0.8971717171717173,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.284308300395257,
"calib/std_conf": 0.0929895975711709,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7925751633986927,
"calib/step_q_c_n": 765.0,
"calib/step_q_gap": -0.0025861269238878215,
"calib/step_q_w": 0.7951612903225805,
"calib/step_q_w_n": 589.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1925.0,
"completions/max_terminated_length": 1925.0,
"completions/mean_length": 509.4140625,
"completions/mean_terminated_length": 511.41180419921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.04469301551580429,
"kl": 0.00034499168395996094,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.1212,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0329505056142807,
"mask/share_reasoning": 0.8455782532691956,
"mask/share_step_conf": 0.11756500601768494,
"num_tokens": 2587402.0,
"reward": 0.27336350083351135,
"reward_std": 0.1707531213760376,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6576433777809143,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.4288851022720337,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7525328993797302,
"adv/mean_abs_reasoning": 0.4725276231765747,
"adv/mean_abs_step_conf": 0.7634186148643494,
"adv/ratio_final_to_reasoning": 1.5925691165329456,
"adv/ratio_step_to_reasoning": 1.6156063210278697,
"adv/std_final_conf": 0.9256395697593689,
"adv/std_reasoning": 0.7393105626106262,
"adv/std_step_conf": 0.9346576929092407,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4485477770641547,
"calib/avg_num_step_conf": 5.68359375,
"calib/ece": 0.2087795275590551,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3700787401574803,
"calib/gap": 0.000744308856062248,
"calib/mean_conf": 0.8833464566929135,
"calib/mu_c": 0.8835838150289018,
"calib/mu_w": 0.8828395061728396,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.205511811023622,
"calib/std_conf": 0.0719102544298752,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.8022555555555555,
"calib/step_q_c_n": 900.0,
"calib/step_q_gap": 0.04533663663663656,
"calib/step_q_w": 0.7569189189189189,
"calib/step_q_w_n": 555.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1694.0,
"completions/max_terminated_length": 1694.0,
"completions/mean_length": 470.015625,
"completions/mean_terminated_length": 471.8588562011719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.0128,
"grad_norm": 0.03708720952272415,
"kl": 0.0020183920860290527,
"learning_rate": 3e-06,
"loss": 0.0311,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03654616326093674,
"mask/share_reasoning": 0.8310366868972778,
"mask/share_step_conf": 0.12851089239120483,
"num_tokens": 2811902.0,
"reward": 0.34594082832336426,
"reward_std": 0.18064436316490173,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7299535274505615,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.370103120803833,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7555368542671204,
"adv/mean_abs_reasoning": 0.5390787720680237,
"adv/mean_abs_step_conf": 0.7594738602638245,
"adv/ratio_final_to_reasoning": 1.4015333072172669,
"adv/ratio_step_to_reasoning": 1.4088365181777,
"adv/std_final_conf": 0.9309623837471008,
"adv/std_reasoning": 0.7927247881889343,
"adv/std_step_conf": 0.9354305267333984,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.582262534643487,
"calib/avg_num_step_conf": 4.875,
"calib/ece": 0.3088235294117647,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3607843137254902,
"calib/gap": 0.01705404383975806,
"calib/mean_conf": 0.8852941176470588,
"calib/mu_c": 0.8925170068027212,
"calib/mu_w": 0.8754629629629631,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3088235294117647,
"calib/std_conf": 0.04986070562474947,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7898414985590779,
"calib/step_q_c_n": 694.0,
"calib/step_q_gap": -0.0033714978308138077,
"calib/step_q_w": 0.7932129963898917,
"calib/step_q_w_n": 554.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2430.0,
"completions/max_terminated_length": 2430.0,
"completions/mean_length": 478.98828125,
"completions/mean_terminated_length": 478.98828125,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.03337820991873741,
"kl": 0.0012568831443786621,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0371,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03469127416610718,
"mask/share_reasoning": 0.8514494895935059,
"mask/share_step_conf": 0.11385929584503174,
"num_tokens": 3039115.0,
"reward": 0.2829028069972992,
"reward_std": 0.19942107796669006,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6625644564628601,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.41004008054733276,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7825286388397217,
"adv/mean_abs_reasoning": 0.5312036275863647,
"adv/mean_abs_step_conf": 0.7674595713615417,
"adv/ratio_final_to_reasoning": 1.4731236727341357,
"adv/ratio_step_to_reasoning": 1.4447558930436064,
"adv/std_final_conf": 0.9330103993415833,
"adv/std_reasoning": 0.7576258182525635,
"adv/std_step_conf": 0.9351935982704163,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.42071611253196933,
"calib/avg_num_step_conf": 5.21875,
"calib/ece": 0.36322709163346617,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.42231075697211157,
"calib/gap": -0.013297314578005093,
"calib/mean_conf": 0.8955776892430279,
"calib/mu_c": 0.8894852941176471,
"calib/mu_w": 0.9027826086956522,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.35848605577689246,
"calib/std_conf": 0.045655527938178404,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7928291316526611,
"calib/step_q_c_n": 714.0,
"calib/step_q_gap": 0.019436848694461606,
"calib/step_q_w": 0.7733922829581995,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2482.0,
"completions/max_terminated_length": 2482.0,
"completions/mean_length": 515.640625,
"completions/mean_terminated_length": 523.825439453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.04907077178359032,
"kl": 0.005765676498413086,
"learning_rate": 3.5e-06,
"loss": 0.0194,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03275441378355026,
"mask/share_reasoning": 0.8358793258666992,
"mask/share_step_conf": 0.1157413199543953,
"num_tokens": 3276519.0,
"reward": 0.2413352131843567,
"reward_std": 0.19809089601039886,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6020089983940125,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.4209010601043701,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7614812850952148,
"adv/mean_abs_reasoning": 0.4563322365283966,
"adv/mean_abs_step_conf": 0.77045738697052,
"adv/ratio_final_to_reasoning": 1.6686993031399162,
"adv/ratio_step_to_reasoning": 1.6883694056590632,
"adv/std_final_conf": 0.927528977394104,
"adv/std_reasoning": 0.7205897569656372,
"adv/std_step_conf": 0.9347683787345886,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.48810442386831276,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.33043650793650803,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5198412698412699,
"calib/gap": -0.0006249999999999867,
"calib/mean_conf": 0.9018650793650793,
"calib/mu_c": 0.9015972222222222,
"calib/mu_w": 0.9022222222222221,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33043650793650803,
"calib/std_conf": 0.04355362561668983,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7819583333333333,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": -0.0040347582037997265,
"calib/step_q_w": 0.785993091537133,
"calib/step_q_w_n": 579.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1891.0,
"completions/max_terminated_length": 1891.0,
"completions/mean_length": 455.0,
"completions/mean_terminated_length": 456.7843322753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.016,
"grad_norm": 0.04268636927008629,
"kl": 0.001432657241821289,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0248,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035845156759023666,
"mask/share_reasoning": 0.8433025479316711,
"mask/share_step_conf": 0.11694604158401489,
"num_tokens": 3500879.0,
"reward": 0.2575795650482178,
"reward_std": 0.18040362000465393,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6336527466773987,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.42786863446235657,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7424015998840332,
"adv/mean_abs_reasoning": 0.4367170035839081,
"adv/mean_abs_step_conf": 0.7670466899871826,
"adv/ratio_final_to_reasoning": 1.6999603720293268,
"adv/ratio_step_to_reasoning": 1.7563930043768197,
"adv/std_final_conf": 0.9280498623847961,
"adv/std_reasoning": 0.7014251351356506,
"adv/std_step_conf": 0.935114324092865,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4426333333333333,
"calib/avg_num_step_conf": 6.36328125,
"calib/ece": 0.31379999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.676,
"calib/gap": -0.0008333333333332416,
"calib/mean_conf": 0.9138000000000001,
"calib/mu_c": 0.9134666666666669,
"calib/mu_w": 0.9143000000000001,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31379999999999997,
"calib/std_conf": 0.048616458118624806,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.780503355704698,
"calib/step_q_c_n": 894.0,
"calib/step_q_gap": 0.041033967949595906,
"calib/step_q_w": 0.7394693877551021,
"calib/step_q_w_n": 735.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3040.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 652.76171875,
"completions/mean_terminated_length": 652.76171875,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.05110244080424309,
"kl": 0.0014814138412475586,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0212,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.026477422565221786,
"mask/share_reasoning": 0.8645951747894287,
"mask/share_step_conf": 0.10892736911773682,
"num_tokens": 3776834.0,
"reward": 0.2704630196094513,
"reward_std": 0.17843686044216156,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6428714990615845,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.41366422176361084,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7551780939102173,
"adv/mean_abs_reasoning": 0.42120128870010376,
"adv/mean_abs_step_conf": 0.7566851377487183,
"adv/ratio_final_to_reasoning": 1.7929149652908725,
"adv/ratio_step_to_reasoning": 1.7964929311683082,
"adv/std_final_conf": 0.9255588054656982,
"adv/std_reasoning": 0.7013599872589111,
"adv/std_step_conf": 0.9355192184448242,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5302579365079365,
"calib/avg_num_step_conf": 5.328125,
"calib/ece": 0.16675889328063234,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6442687747035574,
"calib/gap": 0.009890046296296418,
"calib/mean_conf": 0.9137944664031621,
"calib/mu_c": 0.9162962962962964,
"calib/mu_w": 0.90640625,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.16675889328063234,
"calib/std_conf": 0.04511545629002079,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.7666471734892788,
"calib/step_q_c_n": 1026.0,
"calib/step_q_gap": 0.026173800708213735,
"calib/step_q_w": 0.7404733727810651,
"calib/step_q_w_n": 338.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2091.0,
"completions/max_terminated_length": 2091.0,
"completions/mean_length": 518.14453125,
"completions/mean_terminated_length": 518.14453125,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.05879347026348114,
"kl": 0.002603292465209961,
"learning_rate": 4.25e-06,
"loss": 0.0993,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03295683488249779,
"mask/share_reasoning": 0.8492019176483154,
"mask/share_step_conf": 0.11784122884273529,
"num_tokens": 4013007.0,
"reward": 0.37962839007377625,
"reward_std": 0.1868617981672287,
"rewards/accuracy_reward_step": 0.73828125,
"rewards/final_brier_reward_step": 0.764146089553833,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3478580713272095,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7532843947410583,
"adv/mean_abs_reasoning": 0.35541170835494995,
"adv/mean_abs_step_conf": 0.7603262066841125,
"adv/ratio_final_to_reasoning": 2.1194698346537097,
"adv/ratio_step_to_reasoning": 2.1392829465392125,
"adv/std_final_conf": 0.9244809746742249,
"adv/std_reasoning": 0.6612080335617065,
"adv/std_step_conf": 0.9350612163543701,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.4683257918552037,
"calib/avg_num_step_conf": 5.41796875,
"calib/ece": 0.39871485943775115,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.678714859437751,
"calib/gap": -0.002163542340013147,
"calib/mean_conf": 0.9208032128514058,
"calib/mu_c": 0.9197692307692308,
"calib/mu_w": 0.9219327731092439,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.39871485943775115,
"calib/std_conf": 0.04024220633117022,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.74281098546042,
"calib/step_q_c_n": 619.0,
"calib/step_q_gap": 0.10115733962708673,
"calib/step_q_w": 0.6416536458333333,
"calib/step_q_w_n": 768.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2919.0,
"completions/max_terminated_length": 2919.0,
"completions/mean_length": 541.265625,
"completions/mean_terminated_length": 541.265625,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.0192,
"grad_norm": 0.03853190690279007,
"kl": 0.0032968521118164062,
"learning_rate": 4.5e-06,
"loss": 0.0616,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0334947407245636,
"mask/share_reasoning": 0.8596949577331543,
"mask/share_step_conf": 0.10681027919054031,
"num_tokens": 4262291.0,
"reward": 0.21546542644500732,
"reward_std": 0.14310386776924133,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5657578706741333,
"rewards/format_reward_step": 0.953125,
"rewards/step_l1_reward": -0.42701447010040283,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.757287859916687,
"adv/mean_abs_reasoning": 0.37841731309890747,
"adv/mean_abs_step_conf": 0.7881565093994141,
"adv/ratio_final_to_reasoning": 2.001197708728389,
"adv/ratio_step_to_reasoning": 2.0827707457280438,
"adv/std_final_conf": 0.9230532646179199,
"adv/std_reasoning": 0.6611937284469604,
"adv/std_step_conf": 0.9348154664039612,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.536557855626327,
"calib/avg_num_step_conf": 4.39453125,
"calib/ece": 0.29727272727272724,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7865612648221344,
"calib/gap": 0.026543922505307904,
"calib/mean_conf": 0.9178260869565217,
"calib/mu_c": 0.9278980891719746,
"calib/mu_w": 0.9013541666666667,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.29727272727272724,
"calib/std_conf": 0.0817376587202468,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7642815249266862,
"calib/step_q_c_n": 682.0,
"calib/step_q_gap": 0.02811899670998197,
"calib/step_q_w": 0.7361625282167042,
"calib/step_q_w_n": 443.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2572.0,
"completions/max_terminated_length": 2572.0,
"completions/mean_length": 507.07421875,
"completions/mean_terminated_length": 507.07421875,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.03658737987279892,
"kl": 0.004879474639892578,
"learning_rate": 4.75e-06,
"loss": 0.052,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032567959278821945,
"mask/share_reasoning": 0.8663681745529175,
"mask/share_step_conf": 0.10106386244297028,
"num_tokens": 4496862.0,
"reward": 0.3029266595840454,
"reward_std": 0.17736773192882538,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.6701222658157349,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.3838001787662506,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.735227108001709,
"adv/mean_abs_reasoning": 0.443066269159317,
"adv/mean_abs_step_conf": 0.7690048813819885,
"adv/ratio_final_to_reasoning": 1.6594066377400922,
"adv/ratio_step_to_reasoning": 1.7356430288433242,
"adv/std_final_conf": 0.9224920272827148,
"adv/std_reasoning": 0.7391899824142456,
"adv/std_step_conf": 0.9353877305984497,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.47368758002560823,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.36718253968253967,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.8333333333333334,
"calib/gap": 0.0013585147247117124,
"calib/mean_conf": 0.930674603174603,
"calib/mu_c": 0.9312676056338027,
"calib/mu_w": 0.929909090909091,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.36718253968253967,
"calib/std_conf": 0.040147808132678944,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7546852122986824,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.047981390642631294,
"calib/step_q_w": 0.7067038216560511,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2121.0,
"completions/max_terminated_length": 2121.0,
"completions/mean_length": 469.92578125,
"completions/mean_terminated_length": 471.7686462402344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.0296552162617445,
"kl": 0.007416725158691406,
"learning_rate": 5e-06,
"loss": 0.0436,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.037708643823862076,
"mask/share_reasoning": 0.8331930637359619,
"mask/share_step_conf": 0.1251920461654663,
"num_tokens": 4722035.0,
"reward": 0.2628268599510193,
"reward_std": 0.190019890666008,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6044167876243591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.38501307368278503,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7824727892875671,
"adv/mean_abs_reasoning": 0.5332362651824951,
"adv/mean_abs_step_conf": 0.7937630414962769,
"adv/ratio_final_to_reasoning": 1.4674035514440735,
"adv/ratio_step_to_reasoning": 1.4885766278942392,
"adv/std_final_conf": 0.9178428053855896,
"adv/std_reasoning": 0.7575809359550476,
"adv/std_step_conf": 0.9357240200042725,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5421815216709528,
"calib/avg_num_step_conf": 4.96875,
"calib/ece": 0.36324218750000004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.9140625,
"calib/gap": 0.007936398419368884,
"calib/mean_conf": 0.9452734375,
"calib/mu_c": 0.9485906040268455,
"calib/mu_w": 0.9406542056074766,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36324218750000004,
"calib/std_conf": 0.03473003357230731,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7154131054131054,
"calib/step_q_c_n": 702.0,
"calib/step_q_gap": 0.021465736992052897,
"calib/step_q_w": 0.6939473684210525,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1699.0,
"completions/max_terminated_length": 1699.0,
"completions/mean_length": 470.1484375,
"completions/mean_terminated_length": 471.9921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.0224,
"grad_norm": 0.028702791780233383,
"kl": 0.010184288024902344,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0438,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035543717443943024,
"mask/share_reasoning": 0.846480131149292,
"mask/share_step_conf": 0.11406994611024857,
"num_tokens": 4945353.0,
"reward": 0.2834588885307312,
"reward_std": 0.2273433953523636,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6274394989013672,
"rewards/format_reward_step": 1.0,
"rewards/step_l1_reward": -0.37692791223526,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.7335162162780762,
"adv/mean_abs_reasoning": 0.4049103260040283,
"adv/mean_abs_step_conf": 0.7814478278160095,
"adv/ratio_final_to_reasoning": 1.8115522602670762,
"adv/ratio_step_to_reasoning": 1.9299281288475592,
"adv/std_final_conf": 0.9055657386779785,
"adv/std_reasoning": 0.6815546154975891,
"adv/std_step_conf": 0.9354314804077148,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4629916819399408,
"calib/avg_num_step_conf": 5.1640625,
"calib/ece": 0.26784313725490183,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9372549019607843,
"calib/gap": -0.002974763851684603,
"calib/mean_conf": 0.9462745098039216,
"calib/mu_c": 0.9453179190751445,
"calib/mu_w": 0.9482926829268291,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.26784313725490183,
"calib/std_conf": 0.023192357117836803,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.715356738391846,
"calib/step_q_c_n": 883.0,
"calib/step_q_gap": 0.014670178027381242,
"calib/step_q_w": 0.7006865603644647,
"calib/step_q_w_n": 439.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1195.0,
"completions/max_terminated_length": 1195.0,
"completions/mean_length": 445.1640625,
"completions/mean_terminated_length": 446.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.03779933229088783,
"kl": 0.012391090393066406,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0277,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0352453738451004,
"mask/share_reasoning": 0.8380044102668762,
"mask/share_step_conf": 0.12284398078918457,
"num_tokens": 5161131.0,
"reward": 0.3495637774467468,
"reward_std": 0.18510903418064117,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.705495297908783,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l1_reward": -0.3407427668571472,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7578845024108887,
"adv/mean_abs_reasoning": 0.5459873676300049,
"adv/mean_abs_step_conf": 0.7861717939376831,
"adv/ratio_final_to_reasoning": 1.3880989695799675,
"adv/ratio_step_to_reasoning": 1.4399083944931894,
"adv/std_final_conf": 0.9229386448860168,
"adv/std_reasoning": 0.79274982213974,
"adv/std_step_conf": 0.9354714155197144,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.539328626285148,
"calib/avg_num_step_conf": 4.9140625,
"calib/ece": 0.418,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9686274509803922,
"calib/gap": 0.0005611296915644104,
"calib/mean_conf": 0.954235294117647,
"calib/mu_c": 0.9544927536231882,
"calib/mu_w": 0.9539316239316238,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.41552941176470587,
"calib/std_conf": 0.025593035384799748,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7103669724770642,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.02596299896713039,
"calib/step_q_w": 0.6844039735099338,
"calib/step_q_w_n": 604.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2725.0,
"completions/max_terminated_length": 2725.0,
"completions/mean_length": 496.51171875,
"completions/mean_terminated_length": 496.51171875,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.02881755866110325,
"kl": 0.014128684997558594,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0437,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03612464666366577,
"mask/share_reasoning": 0.8429062366485596,
"mask/share_step_conf": 0.12096910178661346,
"num_tokens": 5392174.0,
"reward": 0.23914536833763123,
"reward_std": 0.21841177344322205,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5744590163230896,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.4016370177268982,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7407926321029663,
"adv/mean_abs_reasoning": 0.5218784809112549,
"adv/mean_abs_step_conf": 0.7790415287017822,
"adv/ratio_final_to_reasoning": 1.4194734199606471,
"adv/ratio_step_to_reasoning": 1.4927642299822241,
"adv/std_final_conf": 0.9203848838806152,
"adv/std_reasoning": 0.7753624320030212,
"adv/std_step_conf": 0.9355775117874146,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4989289314516129,
"calib/avg_num_step_conf": 5.484375,
"calib/ece": 0.4689285714285714,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9603174603174603,
"calib/gap": -0.007467237903225898,
"calib/mean_conf": 0.9523412698412698,
"calib/mu_c": 0.9485483870967741,
"calib/mu_w": 0.956015625,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.4646031746031746,
"calib/std_conf": 0.05403256847060262,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6813832853025936,
"calib/step_q_c_n": 694.0,
"calib/step_q_gap": 0.04469314445752326,
"calib/step_q_w": 0.6366901408450704,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2591.0,
"completions/max_terminated_length": 2591.0,
"completions/mean_length": 548.1875,
"completions/mean_terminated_length": 548.1875,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.0256,
"grad_norm": 0.04038697108626366,
"kl": 0.019349098205566406,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0611,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03291749954223633,
"mask/share_reasoning": 0.8484556674957275,
"mask/share_step_conf": 0.11862681806087494,
"num_tokens": 5637022.0,
"reward": 0.21600434184074402,
"reward_std": 0.22113731503486633,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.5229433178901672,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.383903443813324,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.72950679063797,
"adv/mean_abs_reasoning": 0.40538179874420166,
"adv/mean_abs_step_conf": 0.7633631229400635,
"adv/ratio_final_to_reasoning": 1.7995548712296605,
"adv/ratio_step_to_reasoning": 1.8830720207587568,
"adv/std_final_conf": 0.90348219871521,
"adv/std_reasoning": 0.6816384196281433,
"adv/std_step_conf": 0.9356397390365601,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5344110115236875,
"calib/avg_num_step_conf": 5.32421875,
"calib/ece": 0.3967460317460318,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9801587301587301,
"calib/gap": 0.0036491677336746298,
"calib/mean_conf": 0.9602380952380952,
"calib/mu_c": 0.9618309859154929,
"calib/mu_w": 0.9581818181818182,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3967460317460318,
"calib/std_conf": 0.02092937048152271,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6436760563380282,
"calib/step_q_c_n": 710.0,
"calib/step_q_gap": -0.0035827491749886864,
"calib/step_q_w": 0.6472588055130168,
"calib/step_q_w_n": 653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2419.0,
"completions/max_terminated_length": 2419.0,
"completions/mean_length": 481.6328125,
"completions/mean_terminated_length": 483.5216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.028889209032058716,
"kl": 0.019657135009765625,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0584,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03381389379501343,
"mask/share_reasoning": 0.8391128182411194,
"mask/share_step_conf": 0.1231670081615448,
"num_tokens": 5863544.0,
"reward": 0.2530894875526428,
"reward_std": 0.18848128616809845,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.5886375308036804,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.39027103781700134,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7045278549194336,
"adv/mean_abs_reasoning": 0.38574790954589844,
"adv/mean_abs_step_conf": 0.7433183193206787,
"adv/ratio_final_to_reasoning": 1.82639448584129,
"adv/ratio_step_to_reasoning": 1.9269535904827362,
"adv/std_final_conf": 0.8941695690155029,
"adv/std_reasoning": 0.681533932685852,
"adv/std_step_conf": 0.935562252998352,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5127207737594617,
"calib/avg_num_step_conf": 4.66796875,
"calib/ece": 0.3098406374501991,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9920318725099602,
"calib/gap": 7.91982057750662e-05,
"calib/mean_conf": 0.9623505976095617,
"calib/mu_c": 0.9623780487804877,
"calib/mu_w": 0.9622988505747126,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3094023904382469,
"calib/std_conf": 0.018175017595124958,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6774024226110363,
"calib/step_q_c_n": 743.0,
"calib/step_q_gap": 0.036849325265903565,
"calib/step_q_w": 0.6405530973451328,
"calib/step_q_w_n": 452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2450.0,
"completions/max_terminated_length": 2450.0,
"completions/mean_length": 480.02734375,
"completions/mean_terminated_length": 483.8070983886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.19219711422920227,
"kl": 0.1522216796875,
"learning_rate": 4.833333333333333e-06,
"loss": -0.0185,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03214826434850693,
"mask/share_reasoning": 0.8559674024581909,
"mask/share_step_conf": 0.10407190024852753,
"num_tokens": 6091671.0,
"reward": 0.32004281878471375,
"reward_std": 0.1753888726234436,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6643816232681274,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3477335572242737,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.768155574798584,
"adv/mean_abs_reasoning": 0.5936883687973022,
"adv/mean_abs_step_conf": 0.7633221745491028,
"adv/ratio_final_to_reasoning": 1.2938700085277375,
"adv/ratio_step_to_reasoning": 1.285728699882475,
"adv/std_final_conf": 0.9210662245750427,
"adv/std_reasoning": 0.8098739385604858,
"adv/std_step_conf": 0.9356999397277832,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4615677321156773,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.3874409448818897,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9645669291338582,
"calib/gap": -0.0015956367326230847,
"calib/mean_conf": 0.9538976377952756,
"calib/mu_c": 0.9532191780821919,
"calib/mu_w": 0.954814814814815,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.383267716535433,
"calib/std_conf": 0.07715515961902557,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6421166892808684,
"calib/step_q_c_n": 737.0,
"calib/step_q_gap": 0.005686879922116139,
"calib/step_q_w": 0.6364298093587523,
"calib/step_q_w_n": 577.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2831.0,
"completions/max_terminated_length": 2831.0,
"completions/mean_length": 475.65625,
"completions/mean_terminated_length": 477.5216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.0288,
"grad_norm": 0.03019108809530735,
"kl": 0.025396347045898438,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0089,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034386180341243744,
"mask/share_reasoning": 0.8439096212387085,
"mask/share_step_conf": 0.11779787391424179,
"num_tokens": 6318655.0,
"reward": 0.26763755083084106,
"reward_std": 0.2534090578556061,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6004222631454468,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l1_reward": -0.37764716148376465,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7378636002540588,
"adv/mean_abs_reasoning": 0.3979493975639343,
"adv/mean_abs_step_conf": 0.7608801126480103,
"adv/ratio_final_to_reasoning": 1.8541643856503491,
"adv/ratio_step_to_reasoning": 1.9120021724012477,
"adv/std_final_conf": 0.8981975317001343,
"adv/std_reasoning": 0.6817038059234619,
"adv/std_step_conf": 0.9358172416687012,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5226771021473008,
"calib/avg_num_step_conf": 4.57421875,
"calib/ece": 0.3626000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.988,
"calib/gap": 0.0027326242558030067,
"calib/mean_conf": 0.9665999999999999,
"calib/mu_c": 0.967682119205298,
"calib/mu_w": 0.964949494949495,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3626000000000001,
"calib/std_conf": 0.01988064385275286,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.6198295454545455,
"calib/step_q_c_n": 704.0,
"calib/step_q_gap": -0.018564458828109753,
"calib/step_q_w": 0.6383940042826552,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2692.0,
"completions/max_terminated_length": 2692.0,
"completions/mean_length": 526.57421875,
"completions/mean_terminated_length": 528.6392211914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.028602536767721176,
"kl": 0.024019241333007812,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0132,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03136756643652916,
"mask/share_reasoning": 0.8636547923088074,
"mask/share_step_conf": 0.10107140243053436,
"num_tokens": 6560402.0,
"reward": 0.2832421660423279,
"reward_std": 0.17565187811851501,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6112655997276306,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3565000593662262,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7392631769180298,
"adv/mean_abs_reasoning": 0.4967987537384033,
"adv/mean_abs_step_conf": 0.7861140966415405,
"adv/ratio_final_to_reasoning": 1.4880536059220866,
"adv/ratio_step_to_reasoning": 1.5823592364635448,
"adv/std_final_conf": 0.9159027934074402,
"adv/std_reasoning": 0.7575135827064514,
"adv/std_step_conf": 0.935678243637085,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5181607901975493,
"calib/avg_num_step_conf": 5.8515625,
"calib/ece": 0.4582608695652173,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.00046886721680405863,
"calib/mean_conf": 0.9681422924901185,
"calib/mu_c": 0.9683720930232558,
"calib/mu_w": 0.9679032258064517,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4582608695652173,
"calib/std_conf": 0.017105448068898105,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5549710982658959,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.01014479553636738,
"calib/step_q_w": 0.5448263027295285,
"calib/step_q_w_n": 806.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2333.0,
"completions/max_terminated_length": 2333.0,
"completions/mean_length": 551.26953125,
"completions/mean_terminated_length": 555.6102294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.028186749666929245,
"kl": 0.02429962158203125,
"learning_rate": 4.75e-06,
"loss": -0.0722,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.02976279705762863,
"mask/share_reasoning": 0.8515908122062683,
"mask/share_step_conf": 0.11083388328552246,
"num_tokens": 6808655.0,
"reward": 0.23735710978507996,
"reward_std": 0.20843389630317688,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5337077975273132,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.3574310839176178,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7558501958847046,
"adv/mean_abs_reasoning": 0.50095134973526,
"adv/mean_abs_step_conf": 0.7730767726898193,
"adv/ratio_final_to_reasoning": 1.5088295425976037,
"adv/ratio_step_to_reasoning": 1.5432172667033848,
"adv/std_final_conf": 0.9127808809280396,
"adv/std_reasoning": 0.7576369643211365,
"adv/std_step_conf": 0.9355639219284058,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4106060606060606,
"calib/avg_num_step_conf": 5.23828125,
"calib/ece": 0.39569721115537865,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9760956175298805,
"calib/gap": 0.012989039329464935,
"calib/mean_conf": 0.9540239043824702,
"calib/mu_c": 0.9597163120567376,
"calib/mu_w": 0.9467272727272726,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3939840637450201,
"calib/std_conf": 0.09227181085240978,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5691840455840456,
"calib/step_q_c_n": 702.0,
"calib/step_q_gap": 0.0016879579471128503,
"calib/step_q_w": 0.5674960876369327,
"calib/step_q_w_n": 639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2748.0,
"completions/max_terminated_length": 2748.0,
"completions/mean_length": 574.0546875,
"completions/mean_terminated_length": 576.305908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.032,
"grad_norm": 0.025844834744930267,
"kl": 0.025775909423828125,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0335,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.029186809435486794,
"mask/share_reasoning": 0.8634512424468994,
"mask/share_step_conf": 0.10345575958490372,
"num_tokens": 7062597.0,
"reward": 0.2614104151725769,
"reward_std": 0.21709409356117249,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5859875082969666,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.368635356426239,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7692661881446838,
"adv/mean_abs_reasoning": 0.5111313462257385,
"adv/mean_abs_step_conf": 0.7831858396530151,
"adv/ratio_final_to_reasoning": 1.5050264356217773,
"adv/ratio_step_to_reasoning": 1.5322594582315543,
"adv/std_final_conf": 0.9017819762229919,
"adv/std_reasoning": 0.7577013373374939,
"adv/std_step_conf": 0.9357402324676514,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5134594882729211,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.506910569105691,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9878048780487805,
"calib/gap": 0.009518923240938282,
"calib/mean_conf": 0.9609756097560977,
"calib/mu_c": 0.9661607142857144,
"calib/mu_w": 0.9566417910447761,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.50630081300813,
"calib/std_conf": 0.0653248870533324,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.605117540687161,
"calib/step_q_c_n": 553.0,
"calib/step_q_gap": 0.08459658938478276,
"calib/step_q_w": 0.5205209513023782,
"calib/step_q_w_n": 883.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2875.0,
"completions/max_terminated_length": 2875.0,
"completions/mean_length": 617.109375,
"completions/mean_terminated_length": 621.968505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.023726139217615128,
"kl": 0.0217742919921875,
"learning_rate": 4.694444444444445e-06,
"loss": -0.017,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.029293062165379524,
"mask/share_reasoning": 0.8607058525085449,
"mask/share_step_conf": 0.1021885871887207,
"num_tokens": 7326489.0,
"reward": 0.19026575982570648,
"reward_std": 0.21116501092910767,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.47342658042907715,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.37180131673812866,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7366917133331299,
"adv/mean_abs_reasoning": 0.40366196632385254,
"adv/mean_abs_step_conf": 0.7732025384902954,
"adv/ratio_final_to_reasoning": 1.8250213663729025,
"adv/ratio_step_to_reasoning": 1.915470376195823,
"adv/std_final_conf": 0.8951123952865601,
"adv/std_reasoning": 0.6816351413726807,
"adv/std_step_conf": 0.9357044696807861,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5704663544807236,
"calib/avg_num_step_conf": 4.9140625,
"calib/ece": 0.43972000000000006,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.988,
"calib/gap": 0.01046699595868883,
"calib/mean_conf": 0.96372,
"calib/mu_c": 0.9687022900763358,
"calib/mu_w": 0.958235294117647,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.43972000000000006,
"calib/std_conf": 0.03922450254624015,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5661003262642741,
"calib/step_q_c_n": 613.0,
"calib/step_q_gap": 0.016999551070475682,
"calib/step_q_w": 0.5491007751937984,
"calib/step_q_w_n": 645.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2225.0,
"completions/max_terminated_length": 2225.0,
"completions/mean_length": 513.23046875,
"completions/mean_terminated_length": 519.3162231445312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.029085859656333923,
"kl": 0.029659271240234375,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0095,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03149807080626488,
"mask/share_reasoning": 0.8505294919013977,
"mask/share_step_conf": 0.1062537431716919,
"num_tokens": 7564580.0,
"reward": 0.23524346947669983,
"reward_std": 0.17312288284301758,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.547758936882019,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.3749282956123352,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.741101861000061,
"adv/mean_abs_reasoning": 0.49518057703971863,
"adv/mean_abs_step_conf": 0.7677923440933228,
"adv/ratio_final_to_reasoning": 1.4966295031814565,
"adv/ratio_step_to_reasoning": 1.5505300080292483,
"adv/std_final_conf": 0.8996115326881409,
"adv/std_reasoning": 0.7576470971107483,
"adv/std_step_conf": 0.9357903599739075,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5124449053668654,
"calib/avg_num_step_conf": 5.27734375,
"calib/ece": 0.42674698795180727,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9759036144578314,
"calib/gap": 0.010530852994555251,
"calib/mean_conf": 0.9608835341365461,
"calib/mu_c": 0.9657894736842104,
"calib/mu_w": 0.9552586206896552,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.42674698795180727,
"calib/std_conf": 0.05386304298457405,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5570839260312944,
"calib/step_q_c_n": 703.0,
"calib/step_q_gap": 0.03455306183376361,
"calib/step_q_w": 0.5225308641975308,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2667.0,
"completions/max_terminated_length": 2667.0,
"completions/mean_length": 533.4453125,
"completions/mean_terminated_length": 539.770751953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.0352,
"grad_norm": 0.023499513044953346,
"kl": 0.03248023986816406,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0092,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.030111689120531082,
"mask/share_reasoning": 0.8539035320281982,
"mask/share_step_conf": 0.10426604002714157,
"num_tokens": 7808014.0,
"reward": 0.24479824304580688,
"reward_std": 0.20775847136974335,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5557679533958435,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.3646090030670166,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7225061655044556,
"adv/mean_abs_reasoning": 0.5000270009040833,
"adv/mean_abs_step_conf": 0.7667113542556763,
"adv/ratio_final_to_reasoning": 1.4449343019439242,
"adv/ratio_step_to_reasoning": 1.533339905383928,
"adv/std_final_conf": 0.9046749472618103,
"adv/std_reasoning": 0.7575814723968506,
"adv/std_step_conf": 0.9356700778007507,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.47993767043241137,
"calib/avg_num_step_conf": 5.6953125,
"calib/ece": 0.36723320158102757,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9723320158102767,
"calib/gap": 0.0009076743280094224,
"calib/mean_conf": 0.9640711462450593,
"calib/mu_c": 0.9644370860927152,
"calib/mu_w": 0.9635294117647057,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36723320158102757,
"calib/std_conf": 0.02644307177563116,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5078490566037736,
"calib/step_q_c_n": 795.0,
"calib/step_q_gap": 0.025209539258373825,
"calib/step_q_w": 0.4826395173453997,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2824.0,
"completions/max_terminated_length": 2824.0,
"completions/mean_length": 489.72265625,
"completions/mean_terminated_length": 491.6431579589844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.024688169360160828,
"kl": 0.03778839111328125,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0327,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03325295448303223,
"mask/share_reasoning": 0.837381899356842,
"mask/share_step_conf": 0.12545892596244812,
"num_tokens": 8038495.0,
"reward": 0.29707586765289307,
"reward_std": 0.21303704380989075,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6169394254684448,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.33841270208358765,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7423563003540039,
"adv/mean_abs_reasoning": 0.49117863178253174,
"adv/mean_abs_step_conf": 0.7686483860015869,
"adv/ratio_final_to_reasoning": 1.511377434437499,
"adv/ratio_step_to_reasoning": 1.564905996036701,
"adv/std_final_conf": 0.899803638458252,
"adv/std_reasoning": 0.7392747402191162,
"adv/std_step_conf": 0.9357601404190063,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5246336996336997,
"calib/avg_num_step_conf": 4.62890625,
"calib/ece": 0.4491015624999999,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.9453125,
"calib/gap": 0.016330891330891495,
"calib/mean_conf": 0.9450390625000001,
"calib/mu_c": 0.953076923076923,
"calib/mu_w": 0.9367460317460315,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4431640624999999,
"calib/std_conf": 0.12427287907713852,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5168055555555555,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.0242111384783798,
"calib/step_q_w": 0.4925944170771757,
"calib/step_q_w_n": 609.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1693.0,
"completions/max_terminated_length": 1693.0,
"completions/mean_length": 538.1015625,
"completions/mean_terminated_length": 540.2117919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.026421768590807915,
"kl": 0.03264617919921875,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0089,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.030006268993020058,
"mask/share_reasoning": 0.874493420124054,
"mask/share_step_conf": 0.09159402549266815,
"num_tokens": 8285505.0,
"reward": 0.24800075590610504,
"reward_std": 0.21784111857414246,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5516136884689331,
"rewards/format_reward_step": 1.0,
"rewards/step_l1_reward": -0.3571746349334717,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.6796886324882507,
"adv/mean_abs_reasoning": 0.3737061619758606,
"adv/mean_abs_step_conf": 0.7662829160690308,
"adv/ratio_final_to_reasoning": 1.8187782317920542,
"adv/ratio_step_to_reasoning": 2.0504958013470715,
"adv/std_final_conf": 0.8769128322601318,
"adv/std_reasoning": 0.6815247535705566,
"adv/std_step_conf": 0.9357625842094421,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5590420081967213,
"calib/avg_num_step_conf": 4.87109375,
"calib/ece": 0.20944664031620558,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": 0.0005729166666664787,
"calib/mean_conf": 0.9604347826086956,
"calib/mu_c": 0.9605729166666667,
"calib/mu_w": 0.9600000000000002,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2054940711462451,
"calib/std_conf": 0.06510715320812133,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.502905982905983,
"calib/step_q_c_n": 936.0,
"calib/step_q_gap": 0.017246818918844664,
"calib/step_q_w": 0.4856591639871383,
"calib/step_q_w_n": 311.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 489.90625,
"completions/mean_terminated_length": 491.8274841308594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0384,
"grad_norm": 0.03915192931890488,
"kl": 0.039752960205078125,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0076,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0351969450712204,
"mask/share_reasoning": 0.8441604375839233,
"mask/share_step_conf": 0.11673638969659805,
"num_tokens": 8513633.0,
"reward": 0.3843995928764343,
"reward_std": 0.17076198756694794,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7632484436035156,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.34132421016693115,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.7042989730834961,
"adv/mean_abs_reasoning": 0.38596227765083313,
"adv/mean_abs_step_conf": 0.7719683647155762,
"adv/ratio_final_to_reasoning": 1.8247870682342986,
"adv/ratio_step_to_reasoning": 2.000113506983575,
"adv/std_final_conf": 0.8864962458610535,
"adv/std_reasoning": 0.6613255143165588,
"adv/std_step_conf": 0.9358731508255005,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.585840108401084,
"calib/avg_num_step_conf": 5.44921875,
"calib/ece": 0.47028806584362165,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.9876543209876543,
"calib/gap": 0.013930894308943231,
"calib/mean_conf": 0.9641152263374487,
"calib/mu_c": 0.9711666666666667,
"calib/mu_w": 0.9572357723577235,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.47028806584362165,
"calib/std_conf": 0.06343671788141357,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5181692573402418,
"calib/step_q_c_n": 579.0,
"calib/step_q_gap": 0.1321275906735751,
"calib/step_q_w": 0.38604166666666667,
"calib/step_q_w_n": 816.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2893.0,
"completions/max_terminated_length": 2893.0,
"completions/mean_length": 582.39453125,
"completions/mean_terminated_length": 589.3004150390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.028868885710835457,
"kl": 0.0316619873046875,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0207,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.030005447566509247,
"mask/share_reasoning": 0.8539750576019287,
"mask/share_step_conf": 0.10430075228214264,
"num_tokens": 8769822.0,
"reward": 0.22407503426074982,
"reward_std": 0.1822788268327713,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.5044206976890564,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l1_reward": -0.33908313512802124,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.7056914567947388,
"adv/mean_abs_reasoning": 0.43036288022994995,
"adv/mean_abs_step_conf": 0.772477388381958,
"adv/ratio_final_to_reasoning": 1.6397591177419304,
"adv/ratio_step_to_reasoning": 1.7949442757916543,
"adv/std_final_conf": 0.8990572094917297,
"adv/std_reasoning": 0.7206587195396423,
"adv/std_step_conf": 0.9357432126998901,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5018480628341364,
"calib/avg_num_step_conf": 4.80078125,
"calib/ece": 0.40133064516129036,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9838709677419355,
"calib/gap": 0.008475348161837482,
"calib/mean_conf": 0.9618145161290323,
"calib/mu_c": 0.9655395683453236,
"calib/mu_w": 0.9570642201834861,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.40133064516129036,
"calib/std_conf": 0.052505125239149485,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4564380566801619,
"calib/step_q_c_n": 741.0,
"calib/step_q_gap": -0.006372189221477476,
"calib/step_q_w": 0.4628102459016394,
"calib/step_q_w_n": 488.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2779.0,
"completions/max_terminated_length": 2779.0,
"completions/mean_length": 543.45703125,
"completions/mean_terminated_length": 543.45703125,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.026996370404958725,
"kl": 0.0350799560546875,
"learning_rate": 4.5e-06,
"loss": -0.0046,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03184231370687485,
"mask/share_reasoning": 0.8610826134681702,
"mask/share_step_conf": 0.10707508027553558,
"num_tokens": 9015835.0,
"reward": 0.26468414068222046,
"reward_std": 0.19265443086624146,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5750671625137329,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.3472614288330078,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7193535566329956,
"adv/mean_abs_reasoning": 0.38643133640289307,
"adv/mean_abs_step_conf": 0.7790340185165405,
"adv/ratio_final_to_reasoning": 1.861530080166682,
"adv/ratio_step_to_reasoning": 2.01597009644249,
"adv/std_final_conf": 0.898597776889801,
"adv/std_reasoning": 0.6612383723258972,
"adv/std_step_conf": 0.9355990886688232,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5419240127269324,
"calib/avg_num_step_conf": 4.91015625,
"calib/ece": 0.43409448818897645,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9763779527559056,
"calib/gap": -0.0058493979661859985,
"calib/mean_conf": 0.9629133858267716,
"calib/mu_c": 0.9602189781021898,
"calib/mu_w": 0.9660683760683758,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.42881889763779535,
"calib/std_conf": 0.0665944387290601,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48212598425196845,
"calib/step_q_c_n": 635.0,
"calib/step_q_gap": 0.022833379750360816,
"calib/step_q_w": 0.45929260450160764,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1228.0,
"completions/max_terminated_length": 1228.0,
"completions/mean_length": 475.38671875,
"completions/mean_terminated_length": 477.2510070800781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0416,
"grad_norm": 0.024599267169833183,
"kl": 0.037715911865234375,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0049,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03345062956213951,
"mask/share_reasoning": 0.8500820994377136,
"mask/share_step_conf": 0.11256100237369537,
"num_tokens": 9243622.0,
"reward": 0.25512662529945374,
"reward_std": 0.17816153168678284,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5565023422241211,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.3509365916252136,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7258785367012024,
"adv/mean_abs_reasoning": 0.440116286277771,
"adv/mean_abs_step_conf": 0.7749820947647095,
"adv/ratio_final_to_reasoning": 1.6492880616625898,
"adv/ratio_step_to_reasoning": 1.7608575708911491,
"adv/std_final_conf": 0.9175970554351807,
"adv/std_reasoning": 0.7205471396446228,
"adv/std_step_conf": 0.9358422756195068,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4996827813729222,
"calib/avg_num_step_conf": 4.66796875,
"calib/ece": 0.5257308300395259,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9525691699604744,
"calib/gap": -0.003204612358837622,
"calib/mean_conf": 0.9487355731225297,
"calib/mu_c": 0.9469369369369369,
"calib/mu_w": 0.9501415492957745,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.5178656126482216,
"calib/std_conf": 0.12410322618837606,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4709760956175299,
"calib/step_q_c_n": 502.0,
"calib/step_q_gap": 0.05189961654105091,
"calib/step_q_w": 0.419076479076479,
"calib/step_q_w_n": 693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 545.328125,
"completions/mean_terminated_length": 545.328125,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.03541141003370285,
"kl": 0.043437957763671875,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0105,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03277438133955002,
"mask/share_reasoning": 0.8654713034629822,
"mask/share_step_conf": 0.1017543375492096,
"num_tokens": 9489986.0,
"reward": 0.19916599988937378,
"reward_std": 0.19551372528076172,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.4707808494567871,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.3560425937175751,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.6797932386398315,
"adv/mean_abs_reasoning": 0.3944539725780487,
"adv/mean_abs_step_conf": 0.7641567587852478,
"adv/ratio_final_to_reasoning": 1.7233778486166067,
"adv/ratio_step_to_reasoning": 1.9372520291554367,
"adv/std_final_conf": 0.8657556772232056,
"adv/std_reasoning": 0.6815750002861023,
"adv/std_step_conf": 0.9356968998908997,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6030844155844156,
"calib/avg_num_step_conf": 4.796875,
"calib/ece": 0.1837007874015749,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9488188976377953,
"calib/gap": 0.05154401154401156,
"calib/mean_conf": 0.9473228346456694,
"calib/mu_c": 0.9586868686868687,
"calib/mu_w": 0.9071428571428571,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17574803149606308,
"calib/std_conf": 0.127148922660387,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4666802030456853,
"calib/step_q_c_n": 985.0,
"calib/step_q_gap": 0.0015773223872491693,
"calib/step_q_w": 0.46510288065843614,
"calib/step_q_w_n": 243.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2506.0,
"completions/max_terminated_length": 2506.0,
"completions/mean_length": 481.74609375,
"completions/mean_terminated_length": 481.74609375,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.036088407039642334,
"kl": 0.041255950927734375,
"learning_rate": 4.416666666666667e-06,
"loss": -0.0088,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03467945009469986,
"mask/share_reasoning": 0.8499183654785156,
"mask/share_step_conf": 0.1154022216796875,
"num_tokens": 9720561.0,
"reward": 0.40249550342559814,
"reward_std": 0.17057295143604279,
"rewards/accuracy_reward_step": 0.7734375,
"rewards/final_brier_reward_step": 0.7933316826820374,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.34068435430526733,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7007678747177124,
"adv/mean_abs_reasoning": 0.38961368799209595,
"adv/mean_abs_step_conf": 0.7727140188217163,
"adv/ratio_final_to_reasoning": 1.798622318248554,
"adv/ratio_step_to_reasoning": 1.9832825248105561,
"adv/std_final_conf": 0.8853124976158142,
"adv/std_reasoning": 0.6815693974494934,
"adv/std_step_conf": 0.9356843829154968,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5324923547400612,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.39150197628458505,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9762845849802372,
"calib/gap": 0.020845438328236265,
"calib/mean_conf": 0.9606719367588933,
"calib/mu_c": 0.9696527777777777,
"calib/mu_w": 0.9488073394495414,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39150197628458505,
"calib/std_conf": 0.08246177149388098,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4925208913649025,
"calib/step_q_c_n": 718.0,
"calib/step_q_gap": 0.04903016623842693,
"calib/step_q_w": 0.4434907251264756,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2255.0,
"completions/max_terminated_length": 2255.0,
"completions/mean_length": 444.375,
"completions/mean_terminated_length": 444.375,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0448,
"grad_norm": 0.024456586688756943,
"kl": 0.043132781982421875,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0717,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03620729595422745,
"mask/share_reasoning": 0.8431611061096191,
"mask/share_step_conf": 0.12063158303499222,
"num_tokens": 9938689.0,
"reward": 0.2879253327846527,
"reward_std": 0.17871731519699097,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5978449583053589,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.3321504592895508,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7734534740447998,
"adv/mean_abs_reasoning": 0.571959376335144,
"adv/mean_abs_step_conf": 0.7615091800689697,
"adv/ratio_final_to_reasoning": 1.352287428174949,
"adv/ratio_step_to_reasoning": 1.3314043122229673,
"adv/std_final_conf": 0.9020565152168274,
"adv/std_reasoning": 0.7755232453346252,
"adv/std_step_conf": 0.9354322552680969,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5376262626262626,
"calib/avg_num_step_conf": 4.6171875,
"calib/ece": 0.39578740157480324,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9448818897637795,
"calib/gap": 0.009183080808080835,
"calib/mean_conf": 0.9548425196850394,
"calib/mu_c": 0.9588194444444444,
"calib/mu_w": 0.9496363636363636,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3918503937007875,
"calib/std_conf": 0.09487546127796324,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4688839285714285,
"calib/step_q_c_n": 672.0,
"calib/step_q_gap": 0.03392314425770304,
"calib/step_q_w": 0.4349607843137255,
"calib/step_q_w_n": 510.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1266.0,
"completions/max_terminated_length": 1266.0,
"completions/mean_length": 487.3359375,
"completions/mean_terminated_length": 489.2471008300781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.03911694511771202,
"kl": 0.039653778076171875,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0361,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.033634115010499954,
"mask/share_reasoning": 0.8571068644523621,
"mask/share_step_conf": 0.10535275936126709,
"num_tokens": 10168671.0,
"reward": 0.2883046865463257,
"reward_std": 0.23567301034927368,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.5909242033958435,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.324471116065979,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7604214549064636,
"adv/mean_abs_reasoning": 0.47109296917915344,
"adv/mean_abs_step_conf": 0.7571343183517456,
"adv/ratio_final_to_reasoning": 1.6141643044077794,
"adv/ratio_step_to_reasoning": 1.6071866232073029,
"adv/std_final_conf": 0.9101094603538513,
"adv/std_reasoning": 0.7392681837081909,
"adv/std_step_conf": 0.9356369972229004,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5641794775653044,
"calib/avg_num_step_conf": 4.9921875,
"calib/ece": 0.4446245059288537,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9090909090909091,
"calib/gap": 0.034324459442569455,
"calib/mean_conf": 0.9280237154150198,
"calib/mu_c": 0.9451181102362204,
"calib/mu_w": 0.9107936507936509,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4353359683794466,
"calib/std_conf": 0.1671832661987067,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.46154589371980675,
"calib/step_q_c_n": 621.0,
"calib/step_q_gap": 0.04247435642909142,
"calib/step_q_w": 0.41907153729071533,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2043.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 524.55078125,
"completions/mean_terminated_length": 526.6078491210938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.02774551697075367,
"kl": 0.0364837646484375,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0266,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030666790902614594,
"mask/share_reasoning": 0.863066554069519,
"mask/share_step_conf": 0.10236036777496338,
"num_tokens": 10409276.0,
"reward": 0.2592760920524597,
"reward_std": 0.21509483456611633,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5511636734008789,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.3294864296913147,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7134293913841248,
"adv/mean_abs_reasoning": 0.5957998633384705,
"adv/mean_abs_step_conf": 0.7652695178985596,
"adv/ratio_final_to_reasoning": 1.1974312773194271,
"adv/ratio_step_to_reasoning": 1.284440573065077,
"adv/std_final_conf": 0.9183647632598877,
"adv/std_reasoning": 0.8429709672927856,
"adv/std_step_conf": 0.9357597231864929,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5528199932455252,
"calib/avg_num_step_conf": 4.99609375,
"calib/ece": 0.38191056910569104,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9024390243902439,
"calib/gap": 0.036455927051671666,
"calib/mean_conf": 0.9339430894308943,
"calib/mu_c": 0.9495035460992908,
"calib/mu_w": 0.9130476190476191,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3713414634146341,
"calib/std_conf": 0.16610892698159163,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.46075,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": 0.04001654740608229,
"calib/step_q_w": 0.4207334525939177,
"calib/step_q_w_n": 559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3059.0,
"completions/max_terminated_length": 3059.0,
"completions/mean_length": 545.0703125,
"completions/mean_terminated_length": 545.0703125,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.048,
"grad_norm": 0.02425733208656311,
"kl": 0.03783416748046875,
"learning_rate": 4.305555555555556e-06,
"loss": 0.1195,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.034552380442619324,
"mask/share_reasoning": 0.8488144874572754,
"mask/share_step_conf": 0.11663312464952469,
"num_tokens": 10653862.0,
"reward": 0.28766316175460815,
"reward_std": 0.22457343339920044,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.5913242101669312,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.31756043434143066,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7256664037704468,
"adv/mean_abs_reasoning": 0.42020779848098755,
"adv/mean_abs_step_conf": 0.7499903440475464,
"adv/ratio_final_to_reasoning": 1.726922742494699,
"adv/ratio_step_to_reasoning": 1.7848082466786488,
"adv/std_final_conf": 0.91023188829422,
"adv/std_reasoning": 0.7013890147209167,
"adv/std_step_conf": 0.9354733824729919,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6028079710144928,
"calib/avg_num_step_conf": 5.28125,
"calib/ece": 0.4146000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.944,
"calib/gap": 0.0023071946169772595,
"calib/mean_conf": 0.96172,
"calib/mu_c": 0.9627536231884058,
"calib/mu_w": 0.9604464285714286,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4121600000000001,
"calib/std_conf": 0.04911050396809219,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47111267605633805,
"calib/step_q_c_n": 710.0,
"calib/step_q_gap": 0.07262357948312936,
"calib/step_q_w": 0.3984890965732087,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3026.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 529.46484375,
"completions/mean_terminated_length": 533.6338500976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.026741398498415947,
"kl": 0.0366668701171875,
"learning_rate": 4.277777777777778e-06,
"loss": -0.009,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035755470395088196,
"mask/share_reasoning": 0.8419286012649536,
"mask/share_step_conf": 0.114503413438797,
"num_tokens": 10894173.0,
"reward": 0.2640422582626343,
"reward_std": 0.17718663811683655,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.5698855519294739,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.34492605924606323,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.705195426940918,
"adv/mean_abs_reasoning": 0.385998398065567,
"adv/mean_abs_step_conf": 0.7689273953437805,
"adv/ratio_final_to_reasoning": 1.8269387398367676,
"adv/ratio_step_to_reasoning": 1.9920481514878408,
"adv/std_final_conf": 0.8877806663513184,
"adv/std_reasoning": 0.6613624691963196,
"adv/std_step_conf": 0.9357512593269348,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5420460816777042,
"calib/avg_num_step_conf": 4.8046875,
"calib/ece": 0.3363967611336034,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9190283400809717,
"calib/gap": 0.035028973509933725,
"calib/mean_conf": 0.9457894736842106,
"calib/mu_c": 0.9594039735099338,
"calib/mu_w": 0.9243750000000001,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.335425101214575,
"calib/std_conf": 0.11577971986890179,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.44354309165526673,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": 0.01596794135466556,
"calib/step_q_w": 0.42757515030060117,
"calib/step_q_w_n": 499.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2732.0,
"completions/max_terminated_length": 2732.0,
"completions/mean_length": 533.7734375,
"completions/mean_terminated_length": 537.9763793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.0264823567122221,
"kl": 0.037876129150390625,
"learning_rate": 4.25e-06,
"loss": -0.0354,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03163876757025719,
"mask/share_reasoning": 0.8584343194961548,
"mask/share_step_conf": 0.10211436450481415,
"num_tokens": 11136795.0,
"reward": 0.30292263627052307,
"reward_std": 0.16629716753959656,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.626888632774353,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.33119967579841614,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7307820320129395,
"adv/mean_abs_reasoning": 0.5326458215713501,
"adv/mean_abs_step_conf": 0.7509745955467224,
"adv/ratio_final_to_reasoning": 1.3719849145855887,
"adv/ratio_step_to_reasoning": 1.4098948403111171,
"adv/std_final_conf": 0.9083433151245117,
"adv/std_reasoning": 0.7754582166671753,
"adv/std_step_conf": 0.9357086420059204,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5705682619001857,
"calib/avg_num_step_conf": 4.54296875,
"calib/ece": 0.43048000000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.904,
"calib/gap": 0.020247293228265573,
"calib/mean_conf": 0.93896,
"calib/mu_c": 0.9487596899224805,
"calib/mu_w": 0.9285123966942149,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.42672000000000004,
"calib/std_conf": 0.11629496291757437,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.46645833333333336,
"calib/step_q_c_n": 576.0,
"calib/step_q_gap": 0.027105692788188585,
"calib/step_q_w": 0.4393526405451448,
"calib/step_q_w_n": 587.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 487.84765625,
"completions/mean_terminated_length": 489.76080322265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0512,
"grad_norm": 0.028510455042123795,
"kl": 0.040187835693359375,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0294,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.0355626717209816,
"mask/share_reasoning": 0.8490986824035645,
"mask/share_step_conf": 0.11143238097429276,
"num_tokens": 11365372.0,
"reward": 0.25411996245384216,
"reward_std": 0.2041478008031845,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5546382665634155,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.34249210357666016,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7097325921058655,
"adv/mean_abs_reasoning": 0.409004807472229,
"adv/mean_abs_step_conf": 0.7667136192321777,
"adv/ratio_final_to_reasoning": 1.7352671145657759,
"adv/ratio_step_to_reasoning": 1.8745833917471417,
"adv/std_final_conf": 0.8881635665893555,
"adv/std_reasoning": 0.6612932085990906,
"adv/std_step_conf": 0.9354108572006226,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6371844660194176,
"calib/avg_num_step_conf": 4.1953125,
"calib/ece": 0.36671936758893287,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9051383399209486,
"calib/gap": 0.026107443365695926,
"calib/mean_conf": 0.9568379446640316,
"calib/mu_c": 0.9674666666666667,
"calib/mu_w": 0.9413592233009708,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.3653359683794467,
"calib/std_conf": 0.06414750651290371,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4735439137134052,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.0058027372428169355,
"calib/step_q_w": 0.46774117647058827,
"calib/step_q_w_n": 425.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 468.19140625,
"completions/mean_terminated_length": 468.19140625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.027232561260461807,
"kl": 0.04390716552734375,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0188,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.035721421241760254,
"mask/share_reasoning": 0.8589103817939758,
"mask/share_step_conf": 0.10536816716194153,
"num_tokens": 11589765.0,
"reward": 0.3063098192214966,
"reward_std": 0.16605576872825623,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6214120984077454,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3197299838066101,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7404731512069702,
"adv/mean_abs_reasoning": 0.41314762830734253,
"adv/mean_abs_step_conf": 0.7684262990951538,
"adv/ratio_final_to_reasoning": 1.7922725449028323,
"adv/ratio_step_to_reasoning": 1.8599315267604968,
"adv/std_final_conf": 0.8928567171096802,
"adv/std_reasoning": 0.6613539457321167,
"adv/std_step_conf": 0.9354705810546875,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5829973118279569,
"calib/avg_num_step_conf": 4.6484375,
"calib/ece": 0.3234387351778656,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.924901185770751,
"calib/gap": 0.03274932795698937,
"calib/mean_conf": 0.953399209486166,
"calib/mu_c": 0.9654375000000002,
"calib/mu_w": 0.9326881720430108,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3222134387351779,
"calib/std_conf": 0.08894366205153649,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.48235543018335686,
"calib/step_q_c_n": 709.0,
"calib/step_q_gap": 0.049548777376704045,
"calib/step_q_w": 0.4328066528066528,
"calib/step_q_w_n": 481.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 478.578125,
"completions/mean_terminated_length": 480.4549255371094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.023459725081920624,
"kl": 0.042621612548828125,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0499,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03598196431994438,
"mask/share_reasoning": 0.8490711450576782,
"mask/share_step_conf": 0.1110406219959259,
"num_tokens": 11817641.0,
"reward": 0.32591211795806885,
"reward_std": 0.18485143780708313,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.656133234500885,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3261839747428894,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.6921615600585938,
"adv/mean_abs_reasoning": 0.459354043006897,
"adv/mean_abs_step_conf": 0.7920522093772888,
"adv/ratio_final_to_reasoning": 1.5068149950912728,
"adv/ratio_step_to_reasoning": 1.7242739482438747,
"adv/std_final_conf": 0.8642164468765259,
"adv/std_reasoning": 0.7392639517784119,
"adv/std_step_conf": 0.935680091381073,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6545796400752082,
"calib/avg_num_step_conf": 3.96484375,
"calib/ece": 0.3464516129032259,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8790322580645161,
"calib/gap": 0.08866236905721214,
"calib/mean_conf": 0.9257258064516131,
"calib/mu_c": 0.9621917808219179,
"calib/mu_w": 0.8735294117647058,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.34173387096774205,
"calib/std_conf": 0.18032977581005438,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4720717781402936,
"calib/step_q_c_n": 613.0,
"calib/step_q_gap": 0.03057924082686081,
"calib/step_q_w": 0.4414925373134328,
"calib/step_q_w_n": 402.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2298.0,
"completions/max_terminated_length": 2298.0,
"completions/mean_length": 499.36328125,
"completions/mean_terminated_length": 503.2952880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.0544,
"grad_norm": 0.02970803529024124,
"kl": 0.04041290283203125,
"learning_rate": 4.138888888888889e-06,
"loss": 0.04,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03323691338300705,
"mask/share_reasoning": 0.8658112287521362,
"mask/share_step_conf": 0.0931393951177597,
"num_tokens": 12054774.0,
"reward": 0.3221244513988495,
"reward_std": 0.17050030827522278,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6329628825187683,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.2941827178001404,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.729950487613678,
"adv/mean_abs_reasoning": 0.4381468892097473,
"adv/mean_abs_step_conf": 0.7624776363372803,
"adv/ratio_final_to_reasoning": 1.6659949108168608,
"adv/ratio_step_to_reasoning": 1.740232910731157,
"adv/std_final_conf": 0.8969921469688416,
"adv/std_reasoning": 0.7014232873916626,
"adv/std_step_conf": 0.9353620409965515,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6346685082872928,
"calib/avg_num_step_conf": 4.125,
"calib/ece": 0.22924901185770752,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8695652173913043,
"calib/gap": 0.08977747084100685,
"calib/mean_conf": 0.9245059288537549,
"calib/mu_c": 0.9500552486187845,
"calib/mu_w": 0.8602777777777777,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2191699604743083,
"calib/std_conf": 0.17271905082592476,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.46316421895861143,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.0243042841051912,
"calib/step_q_w": 0.43885993485342023,
"calib/step_q_w_n": 307.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1880.0,
"completions/max_terminated_length": 1880.0,
"completions/mean_length": 460.2890625,
"completions/mean_terminated_length": 460.2890625,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.030791480094194412,
"kl": 0.041439056396484375,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0236,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.036113008856773376,
"mask/share_reasoning": 0.861052393913269,
"mask/share_step_conf": 0.10283458977937698,
"num_tokens": 12280560.0,
"reward": 0.3836786150932312,
"reward_std": 0.18867164850234985,
"rewards/accuracy_reward_step": 0.70703125,
"rewards/final_brier_reward_step": 0.7272488474845886,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.29426658153533936,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7180808186531067,
"adv/mean_abs_reasoning": 0.3494745194911957,
"adv/mean_abs_step_conf": 0.7842679023742676,
"adv/ratio_final_to_reasoning": 2.0547444194173856,
"adv/ratio_step_to_reasoning": 2.244134718365428,
"adv/std_final_conf": 0.8857766389846802,
"adv/std_reasoning": 0.6403147578239441,
"adv/std_step_conf": 0.9356178045272827,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5780065359477125,
"calib/avg_num_step_conf": 4.1953125,
"calib/ece": 0.3447619047619049,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8849206349206349,
"calib/gap": 0.05336470588235298,
"calib/mean_conf": 0.9366666666666668,
"calib/mu_c": 0.9582666666666667,
"calib/mu_w": 0.9049019607843137,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3430952380952382,
"calib/std_conf": 0.1420065949731667,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45656151419558355,
"calib/step_q_c_n": 634.0,
"calib/step_q_gap": -0.0026430312589619254,
"calib/step_q_w": 0.4592045454545455,
"calib/step_q_w_n": 440.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2807.0,
"completions/max_terminated_length": 2807.0,
"completions/mean_length": 475.59765625,
"completions/mean_terminated_length": 477.4627685546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.03513414040207863,
"kl": 0.040142059326171875,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0345,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03485337644815445,
"mask/share_reasoning": 0.8665106296539307,
"mask/share_step_conf": 0.09472978115081787,
"num_tokens": 12508137.0,
"reward": 0.30111369490623474,
"reward_std": 0.19021180272102356,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.637919545173645,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.3497546315193176,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.6736820936203003,
"adv/mean_abs_reasoning": 0.2930399179458618,
"adv/mean_abs_step_conf": 0.784577488899231,
"adv/ratio_final_to_reasoning": 2.298943087148833,
"adv/ratio_step_to_reasoning": 2.677374107933579,
"adv/std_final_conf": 0.8704360723495483,
"adv/std_reasoning": 0.5726863145828247,
"adv/std_step_conf": 0.9355136752128601,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6744080145719489,
"calib/avg_num_step_conf": 4.109375,
"calib/ece": 0.22658823529411773,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.8980392156862745,
"calib/gap": 0.11522996357012771,
"calib/mean_conf": 0.9253333333333333,
"calib/mu_c": 0.9578688524590165,
"calib/mu_w": 0.8426388888888888,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.21713725490196087,
"calib/std_conf": 0.1841136335673148,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5020906432748539,
"calib/step_q_c_n": 684.0,
"calib/step_q_gap": 0.07888412153572338,
"calib/step_q_w": 0.4232065217391305,
"calib/step_q_w_n": 368.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1153.0,
"completions/max_terminated_length": 1153.0,
"completions/mean_length": 403.140625,
"completions/mean_terminated_length": 404.7215881347656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.0576,
"grad_norm": 0.040213968604803085,
"kl": 0.04549407958984375,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0123,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.04169517755508423,
"mask/share_reasoning": 0.8462526202201843,
"mask/share_step_conf": 0.10814596712589264,
"num_tokens": 12717573.0,
"reward": 0.3985499143600464,
"reward_std": 0.1534053236246109,
"rewards/accuracy_reward_step": 0.71484375,
"rewards/final_brier_reward_step": 0.7640405893325806,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l1_reward": -0.3091282844543457,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.6789047718048096,
"adv/mean_abs_reasoning": 0.43928492069244385,
"adv/mean_abs_step_conf": 0.763441801071167,
"adv/ratio_final_to_reasoning": 1.5454770692666926,
"adv/ratio_step_to_reasoning": 1.737919434766291,
"adv/std_final_conf": 0.8564179539680481,
"adv/std_reasoning": 0.7014631628990173,
"adv/std_step_conf": 0.9355975985527039,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.662468112244898,
"calib/avg_num_step_conf": 3.8046875,
"calib/ece": 0.3673412698412697,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.876984126984127,
"calib/gap": 0.11323214285714278,
"calib/mean_conf": 0.9206746031746031,
"calib/mu_c": 0.971,
"calib/mu_w": 0.8577678571428572,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3662301587301586,
"calib/std_conf": 0.20012481539267846,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5244223107569721,
"calib/step_q_c_n": 502.0,
"calib/step_q_gap": 0.06404095482476868,
"calib/step_q_w": 0.4603813559322034,
"calib/step_q_w_n": 472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2746.0,
"completions/max_terminated_length": 2746.0,
"completions/mean_length": 443.91015625,
"completions/mean_terminated_length": 443.91015625,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.033687423914670944,
"kl": 0.0462799072265625,
"learning_rate": 4.027777777777779e-06,
"loss": 0.0198,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03844766318798065,
"mask/share_reasoning": 0.8608040809631348,
"mask/share_step_conf": 0.10074827075004578,
"num_tokens": 12939038.0,
"reward": 0.3079288899898529,
"reward_std": 0.21377216279506683,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6218097805976868,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.311420738697052,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.6719927787780762,
"adv/mean_abs_reasoning": 0.5117623805999756,
"adv/mean_abs_step_conf": 0.7626398801803589,
"adv/ratio_final_to_reasoning": 1.3130953040945508,
"adv/ratio_step_to_reasoning": 1.4902226288815166,
"adv/std_final_conf": 0.8674094080924988,
"adv/std_reasoning": 0.7752949595451355,
"adv/std_step_conf": 0.9356803894042969,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6647956892680736,
"calib/avg_num_step_conf": 4.23046875,
"calib/ece": 0.42120000000000013,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9,
"calib/gap": 0.0558291102700621,
"calib/mean_conf": 0.9431200000000001,
"calib/mu_c": 0.9696946564885496,
"calib/mu_w": 0.9138655462184875,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.4201600000000001,
"calib/std_conf": 0.14516427108624216,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.48471666666666663,
"calib/step_q_c_n": 600.0,
"calib/step_q_gap": 0.019312939958592124,
"calib/step_q_w": 0.4654037267080745,
"calib/step_q_w_n": 483.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2375.0,
"completions/max_terminated_length": 2375.0,
"completions/mean_length": 444.25,
"completions/mean_terminated_length": 447.7480163574219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.040319155901670456,
"kl": 0.041660308837890625,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0712,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03685273975133896,
"mask/share_reasoning": 0.8518688678741455,
"mask/share_step_conf": 0.10346592217683792,
"num_tokens": 13159606.0,
"reward": 0.27113640308380127,
"reward_std": 0.19077391922473907,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.564152717590332,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.31875497102737427,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.6866100430488586,
"adv/mean_abs_reasoning": 0.5079214572906494,
"adv/mean_abs_step_conf": 0.753515362739563,
"adv/ratio_final_to_reasoning": 1.351803577488867,
"adv/ratio_step_to_reasoning": 1.483527328731018,
"adv/std_final_conf": 0.8826867341995239,
"adv/std_reasoning": 0.7755393981933594,
"adv/std_step_conf": 0.9357887506484985,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6132120796156486,
"calib/avg_num_step_conf": 3.859375,
"calib/ece": 0.33000000000000024,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.8875502008032129,
"calib/gap": 0.04507206588881285,
"calib/mean_conf": 0.9297590361445784,
"calib/mu_c": 0.9467741935483873,
"calib/mu_w": 0.9017021276595745,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.31863453815261067,
"calib/std_conf": 0.17179404805891435,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5024758842443731,
"calib/step_q_c_n": 622.0,
"calib/step_q_gap": 0.04072725036459163,
"calib/step_q_w": 0.46174863387978143,
"calib/step_q_w_n": 366.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2568.0,
"completions/max_terminated_length": 2568.0,
"completions/mean_length": 451.40625,
"completions/mean_terminated_length": 453.1764831542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.0608,
"grad_norm": 0.02778824418783188,
"kl": 0.039257049560546875,
"learning_rate": 3.972222222222223e-06,
"loss": 0.0263,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0373077429831028,
"mask/share_reasoning": 0.8600698709487915,
"mask/share_step_conf": 0.09871610999107361,
"num_tokens": 13381958.0,
"reward": 0.29894113540649414,
"reward_std": 0.21296796202659607,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6251699328422546,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l1_reward": -0.3366626501083374,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7430697679519653,
"adv/mean_abs_reasoning": 0.5540591478347778,
"adv/mean_abs_step_conf": 0.7561875581741333,
"adv/ratio_final_to_reasoning": 1.3411379829316545,
"adv/ratio_step_to_reasoning": 1.3648137768851183,
"adv/std_final_conf": 0.9170951843261719,
"adv/std_reasoning": 0.792834460735321,
"adv/std_step_conf": 0.9358760714530945,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.49554195222569697,
"calib/avg_num_step_conf": 4.5625,
"calib/ece": 0.42117886178861785,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.8577235772357723,
"calib/gap": 0.0005303080710626906,
"calib/mean_conf": 0.9112601626016261,
"calib/mu_c": 0.9115037593984964,
"calib/mu_w": 0.9109734513274337,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3958943089430894,
"calib/std_conf": 0.21098982125482804,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.4372477064220183,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": -0.0034137721772034557,
"calib/step_q_w": 0.4406614785992218,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2707.0,
"completions/max_terminated_length": 2707.0,
"completions/mean_length": 525.37109375,
"completions/mean_terminated_length": 525.37109375,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.03578052669763565,
"kl": 0.037464141845703125,
"learning_rate": 3.944444444444445e-06,
"loss": 0.013,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.0333830825984478,
"mask/share_reasoning": 0.8662627935409546,
"mask/share_step_conf": 0.10035405308008194,
"num_tokens": 13622773.0,
"reward": 0.2506124973297119,
"reward_std": 0.24518176913261414,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5477023124694824,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l1_reward": -0.3417898118495941,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.6008204221725464,
"adv/mean_abs_reasoning": 0.38778069615364075,
"adv/mean_abs_step_conf": 0.7656729221343994,
"adv/ratio_final_to_reasoning": 1.5493819783502019,
"adv/ratio_step_to_reasoning": 1.9744998390302435,
"adv/std_final_conf": 0.8101057410240173,
"adv/std_reasoning": 0.6613629460334778,
"adv/std_step_conf": 0.9356751441955566,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6365789473684211,
"calib/avg_num_step_conf": 3.82421875,
"calib/ece": 0.32876984126984143,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8571428571428571,
"calib/gap": 0.10785789473684226,
"calib/mean_conf": 0.910357142857143,
"calib/mu_c": 0.9531578947368423,
"calib/mu_w": 0.8453,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.31797619047619063,
"calib/std_conf": 0.22871160756093376,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.520053003533569,
"calib/step_q_c_n": 566.0,
"calib/step_q_gap": 0.08189319723816946,
"calib/step_q_w": 0.4381598062953995,
"calib/step_q_w_n": 413.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2707.0,
"completions/max_terminated_length": 2707.0,
"completions/mean_length": 473.37890625,
"completions/mean_terminated_length": 475.2353210449219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.051249004900455475,
"kl": 0.043796539306640625,
"learning_rate": 3.916666666666667e-06,
"loss": -0.0535,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03806016966700554,
"mask/share_reasoning": 0.8660639524459839,
"mask/share_step_conf": 0.09196965396404266,
"num_tokens": 13850206.0,
"reward": 0.3142896294593811,
"reward_std": 0.1821354180574417,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6513011455535889,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3375656306743622,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.717840313911438,
"adv/mean_abs_reasoning": 0.5838816165924072,
"adv/mean_abs_step_conf": 0.7559605836868286,
"adv/ratio_final_to_reasoning": 1.2294278386444626,
"adv/ratio_step_to_reasoning": 1.294715507740579,
"adv/std_final_conf": 0.8904332518577576,
"adv/std_reasoning": 0.8266067504882812,
"adv/std_step_conf": 0.9359144568443298,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6412456008044243,
"calib/avg_num_step_conf": 3.63671875,
"calib/ece": 0.3835177865612649,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8142292490118577,
"calib/gap": 0.10164844142785334,
"calib/mean_conf": 0.8696837944664033,
"calib/mu_c": 0.9166911764705883,
"calib/mu_w": 0.815042735042735,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.35782608695652185,
"calib/std_conf": 0.2793214158881204,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5305689277899344,
"calib/step_q_c_n": 457.0,
"calib/step_q_gap": 0.09343812610217067,
"calib/step_q_w": 0.4371308016877637,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2209.0,
"completions/max_terminated_length": 2209.0,
"completions/mean_length": 444.31640625,
"completions/mean_terminated_length": 446.058837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.064,
"grad_norm": 0.037136998027563095,
"kl": 0.04034423828125,
"learning_rate": 3.88888888888889e-06,
"loss": -0.017,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03838019073009491,
"mask/share_reasoning": 0.8667376041412354,
"mask/share_step_conf": 0.09097597002983093,
"num_tokens": 14072807.0,
"reward": 0.2943606972694397,
"reward_std": 0.2698456645011902,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6016941070556641,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.31375402212142944,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.6191959381103516,
"adv/mean_abs_reasoning": 0.4406328797340393,
"adv/mean_abs_step_conf": 0.7638034820556641,
"adv/ratio_final_to_reasoning": 1.4052422472060886,
"adv/ratio_step_to_reasoning": 1.7334237120858675,
"adv/std_final_conf": 0.8238195180892944,
"adv/std_reasoning": 0.7014070153236389,
"adv/std_step_conf": 0.935490071773529,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6044869131699211,
"calib/avg_num_step_conf": 3.875,
"calib/ece": 0.3202766798418971,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.924901185770751,
"calib/gap": 0.023579144162858667,
"calib/mean_conf": 0.9609881422924902,
"calib/mu_c": 0.9690963855421687,
"calib/mu_w": 0.94551724137931,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.31256916996047424,
"calib/std_conf": 0.12019504840785379,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5196694214876033,
"calib/step_q_c_n": 605.0,
"calib/step_q_gap": 0.08757639823178937,
"calib/step_q_w": 0.43209302325581395,
"calib/step_q_w_n": 387.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2827.0,
"completions/max_terminated_length": 2827.0,
"completions/mean_length": 390.91015625,
"completions/mean_terminated_length": 392.4431457519531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.03942104056477547,
"kl": 0.047290802001953125,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0588,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.043848633766174316,
"mask/share_reasoning": 0.8465343713760376,
"mask/share_step_conf": 0.10571075230836868,
"num_tokens": 14276944.0,
"reward": 0.3404572010040283,
"reward_std": 0.21853400766849518,
"rewards/accuracy_reward_step": 0.6484375,
"rewards/final_brier_reward_step": 0.6696093678474426,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.3152574300765991,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.6601003408432007,
"adv/mean_abs_reasoning": 0.473049134016037,
"adv/mean_abs_step_conf": 0.7604014277458191,
"adv/ratio_final_to_reasoning": 1.3954160220930083,
"adv/ratio_step_to_reasoning": 1.6074470347091694,
"adv/std_final_conf": 0.8655326962471008,
"adv/std_reasoning": 0.7394403219223022,
"adv/std_step_conf": 0.9359850287437439,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.504105899356195,
"calib/avg_num_step_conf": 3.796875,
"calib/ece": 0.46506072874493926,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.8866396761133604,
"calib/gap": -0.011488634870581937,
"calib/mean_conf": 0.9114574898785426,
"calib/mu_c": 0.905968992248062,
"calib/mu_w": 0.917457627118644,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.4271255060728745,
"calib/std_conf": 0.23152089388703107,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4835079726651481,
"calib/step_q_c_n": 439.0,
"calib/step_q_gap": 0.05711022407227756,
"calib/step_q_w": 0.42639774859287055,
"calib/step_q_w_n": 533.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 463.93359375,
"completions/mean_terminated_length": 467.58660888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.0323968380689621,
"kl": 0.046382904052734375,
"learning_rate": 3.833333333333334e-06,
"loss": -0.1007,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03665943443775177,
"mask/share_reasoning": 0.8647003173828125,
"mask/share_step_conf": 0.09082774817943573,
"num_tokens": 14502791.0,
"reward": 0.22184567153453827,
"reward_std": 0.23442932963371277,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5089241862297058,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.35585787892341614,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.645679235458374,
"adv/mean_abs_reasoning": 0.4358217418193817,
"adv/mean_abs_step_conf": 0.7689712643623352,
"adv/ratio_final_to_reasoning": 1.4815213962546272,
"adv/ratio_step_to_reasoning": 1.7644169406330839,
"adv/std_final_conf": 0.8309564590454102,
"adv/std_reasoning": 0.720628559589386,
"adv/std_step_conf": 0.9353896975517273,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6517218543046357,
"calib/avg_num_step_conf": 3.75390625,
"calib/ece": 0.3347675962815405,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.848605577689243,
"calib/gap": 0.09549426048565113,
"calib/mean_conf": 0.9161487383798143,
"calib/mu_c": 0.954194260485651,
"calib/mu_w": 0.8586999999999999,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.32466135458167333,
"calib/std_conf": 0.19615684930194882,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5264946619217081,
"calib/step_q_c_n": 562.0,
"calib/step_q_gap": 0.1052415290896278,
"calib/step_q_w": 0.4212531328320803,
"calib/step_q_w_n": 399.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2094.0,
"completions/max_terminated_length": 2094.0,
"completions/mean_length": 482.83203125,
"completions/mean_terminated_length": 482.83203125,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.0672,
"grad_norm": 0.039437055587768555,
"kl": 0.040195465087890625,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.008,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03730270266532898,
"mask/share_reasoning": 0.8711056113243103,
"mask/share_step_conf": 0.09159170091152191,
"num_tokens": 14735036.0,
"reward": 0.3450906574726105,
"reward_std": 0.19561327993869781,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6513279676437378,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.27364665269851685,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.6538460850715637,
"adv/mean_abs_reasoning": 0.47247079014778137,
"adv/mean_abs_step_conf": 0.7776767015457153,
"adv/ratio_final_to_reasoning": 1.3838867898416556,
"adv/ratio_step_to_reasoning": 1.645978371070242,
"adv/std_final_conf": 0.8448007702827454,
"adv/std_reasoning": 0.7207430005073547,
"adv/std_step_conf": 0.9357219338417053,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5599581339712919,
"calib/avg_num_step_conf": 3.90625,
"calib/ece": 0.2973809523809523,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8412698412698413,
"calib/gap": 0.025242224880382746,
"calib/mean_conf": 0.8846031746031746,
"calib/mu_c": 0.8922159090909091,
"calib/mu_w": 0.8669736842105263,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2417857142857142,
"calib/std_conf": 0.26720229181619426,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4425141242937854,
"calib/step_q_c_n": 708.0,
"calib/step_q_gap": -0.023410533240461218,
"calib/step_q_w": 0.4659246575342466,
"calib/step_q_w_n": 292.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1505.0,
"completions/max_terminated_length": 1505.0,
"completions/mean_length": 429.77734375,
"completions/mean_terminated_length": 431.4627685546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.044037114828825,
"kl": 0.04555511474609375,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0171,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03862915188074112,
"mask/share_reasoning": 0.8621880412101746,
"mask/share_step_conf": 0.09527651220560074,
"num_tokens": 14948835.0,
"reward": 0.33689361810684204,
"reward_std": 0.22495046257972717,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.6830945611000061,
"rewards/format_reward_step": 0.984375,
"rewards/step_l1_reward": -0.34368225932121277,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.5660860538482666,
"adv/mean_abs_reasoning": 0.3165215253829956,
"adv/mean_abs_step_conf": 0.7652155756950378,
"adv/ratio_final_to_reasoning": 1.7884598943572458,
"adv/ratio_step_to_reasoning": 2.4175783140470966,
"adv/std_final_conf": 0.7545972466468811,
"adv/std_reasoning": 0.5961599349975586,
"adv/std_step_conf": 0.935614824295044,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6484870269916999,
"calib/avg_num_step_conf": 3.40234375,
"calib/ece": 0.40028000000000014,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.94,
"calib/gap": 0.042424024573557206,
"calib/mean_conf": 0.9443600000000001,
"calib/mu_c": 0.9625174825174824,
"calib/mu_w": 0.9200934579439252,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3863200000000001,
"calib/std_conf": 0.18650627442528575,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5389227642276422,
"calib/step_q_c_n": 492.0,
"calib/step_q_gap": 0.04129743441233874,
"calib/step_q_w": 0.49762532981530344,
"calib/step_q_w_n": 379.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2430.0,
"completions/max_terminated_length": 2430.0,
"completions/mean_length": 371.63671875,
"completions/mean_terminated_length": 371.63671875,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.03307357802987099,
"kl": 0.0516510009765625,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0813,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.044516414403915405,
"mask/share_reasoning": 0.8548356294631958,
"mask/share_step_conf": 0.1006479412317276,
"num_tokens": 15148998.0,
"reward": 0.28827229142189026,
"reward_std": 0.18606629967689514,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.5883980393409729,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l1_reward": -0.31810346245765686,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.7129285335540771,
"adv/mean_abs_reasoning": 0.49174657464027405,
"adv/mean_abs_step_conf": 0.7922347187995911,
"adv/ratio_final_to_reasoning": 1.449788509611081,
"adv/ratio_step_to_reasoning": 1.6110630142754574,
"adv/std_final_conf": 0.8913588523864746,
"adv/std_reasoning": 0.7574918270111084,
"adv/std_step_conf": 0.9358580112457275,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.627749417029054,
"calib/avg_num_step_conf": 3.94140625,
"calib/ece": 0.4444444444444445,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8452380952380952,
"calib/gap": 0.05686519190773309,
"calib/mean_conf": 0.9134920634920636,
"calib/mu_c": 0.9426016260162603,
"calib/mu_w": 0.8857364341085272,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.43492063492063493,
"calib/std_conf": 0.21535064599546164,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5211084337349398,
"calib/step_q_c_n": 415.0,
"calib/step_q_gap": 0.15723637986288586,
"calib/step_q_w": 0.3638720538720539,
"calib/step_q_w_n": 594.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2373.0,
"completions/max_terminated_length": 2373.0,
"completions/mean_length": 471.4609375,
"completions/mean_terminated_length": 475.1732177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.0704,
"grad_norm": 0.04067553952336311,
"kl": 0.049777984619140625,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0223,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03776799142360687,
"mask/share_reasoning": 0.8647385835647583,
"mask/share_step_conf": 0.08968096226453781,
"num_tokens": 15376044.0,
"reward": 0.25311705470085144,
"reward_std": 0.2277534306049347,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.542452335357666,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3284057378768921,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.5803280472755432,
"adv/mean_abs_reasoning": 0.35292232036590576,
"adv/mean_abs_step_conf": 0.7632925510406494,
"adv/ratio_final_to_reasoning": 1.6443506510834052,
"adv/ratio_step_to_reasoning": 2.1627777757135807,
"adv/std_final_conf": 0.7728992700576782,
"adv/std_reasoning": 0.6401668190956116,
"adv/std_step_conf": 0.9356393814086914,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7155070754716982,
"calib/avg_num_step_conf": 3.546875,
"calib/ece": 0.3339607843137255,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.8745098039215686,
"calib/gap": 0.07929638364779867,
"calib/mean_conf": 0.9309019607843138,
"calib/mu_c": 0.960754716981132,
"calib/mu_w": 0.8814583333333333,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.32066666666666666,
"calib/std_conf": 0.1963000159721469,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.49503521126760563,
"calib/step_q_c_n": 568.0,
"calib/step_q_gap": 0.030358740679370333,
"calib/step_q_w": 0.4646764705882353,
"calib/step_q_w_n": 340.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2748.0,
"completions/max_terminated_length": 2748.0,
"completions/mean_length": 438.59375,
"completions/mean_terminated_length": 438.59375,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.0408489964902401,
"kl": 0.0499267578125,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0181,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.039613641798496246,
"mask/share_reasoning": 0.8700891733169556,
"mask/share_step_conf": 0.09029721468687057,
"num_tokens": 15593332.0,
"reward": 0.34359416365623474,
"reward_std": 0.1910526156425476,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.6591253876686096,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l1_reward": -0.2938120365142822,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.60542231798172,
"adv/mean_abs_reasoning": 0.42741620540618896,
"adv/mean_abs_step_conf": 0.7589120864868164,
"adv/ratio_final_to_reasoning": 1.4164702000626426,
"adv/ratio_step_to_reasoning": 1.7755809837991403,
"adv/std_final_conf": 0.8146088719367981,
"adv/std_reasoning": 0.7204663753509521,
"adv/std_step_conf": 0.9357614517211914,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7003043253043253,
"calib/avg_num_step_conf": 3.64453125,
"calib/ece": 0.3646613545816734,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9043824701195219,
"calib/gap": 0.1351100751100751,
"calib/mean_conf": 0.934382470119522,
"calib/mu_c": 0.9925174825174826,
"calib/mu_w": 0.8574074074074075,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3646613545816734,
"calib/std_conf": 0.19353885392746306,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5307739307535642,
"calib/step_q_c_n": 491.0,
"calib/step_q_gap": 0.13072868188478587,
"calib/step_q_w": 0.4000452488687783,
"calib/step_q_w_n": 442.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2438.0,
"completions/max_terminated_length": 2438.0,
"completions/mean_length": 438.5625,
"completions/mean_terminated_length": 438.5625,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.041994914412498474,
"kl": 0.048595428466796875,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0268,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.04383733496069908,
"mask/share_reasoning": 0.8566344380378723,
"mask/share_step_conf": 0.0995282307267189,
"num_tokens": 15809692.0,
"reward": 0.3205069899559021,
"reward_std": 0.2103608250617981,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6379590034484863,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3047575354576111,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.6797294616699219,
"adv/mean_abs_reasoning": 0.49723154306411743,
"adv/mean_abs_step_conf": 0.7593463659286499,
"adv/ratio_final_to_reasoning": 1.3670280398568189,
"adv/ratio_step_to_reasoning": 1.5271484211345239,
"adv/std_final_conf": 0.862686276435852,
"adv/std_reasoning": 0.7394139170646667,
"adv/std_step_conf": 0.9360142350196838,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6129368126504848,
"calib/avg_num_step_conf": 3.73046875,
"calib/ece": 0.4212096774193548,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8024193548387096,
"calib/gap": 0.06364807704821995,
"calib/mean_conf": 0.8887096774193549,
"calib/mu_c": 0.919763779527559,
"calib/mu_w": 0.856115702479339,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3989112903225806,
"calib/std_conf": 0.24823885396131076,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5252898550724638,
"calib/step_q_c_n": 414.0,
"calib/step_q_gap": 0.1472491896380831,
"calib/step_q_w": 0.37804066543438075,
"calib/step_q_w_n": 541.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 520.7265625,
"completions/mean_terminated_length": 520.7265625,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.0736,
"grad_norm": 0.057562634348869324,
"kl": 0.039585113525390625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.0314,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03592987731099129,
"mask/share_reasoning": 0.8848861455917358,
"mask/share_step_conf": 0.07918399572372437,
"num_tokens": 16047494.0,
"reward": 0.2596679627895355,
"reward_std": 0.2375001162290573,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.5565113425254822,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.3293628692626953,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.685692548751831,
"adv/mean_abs_reasoning": 0.4477848410606384,
"adv/mean_abs_step_conf": 0.7801724672317505,
"adv/ratio_final_to_reasoning": 1.531299155030966,
"adv/ratio_step_to_reasoning": 1.7422931633500756,
"adv/std_final_conf": 0.8694822788238525,
"adv/std_reasoning": 0.7392827272415161,
"adv/std_step_conf": 0.9357007145881653,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7277905785970301,
"calib/avg_num_step_conf": 3.78125,
"calib/ece": 0.34496000000000004,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.756,
"calib/gap": 0.2317869943676395,
"calib/mean_conf": 0.84424,
"calib/mu_c": 0.9592063492063491,
"calib/mu_w": 0.7274193548387096,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3426,
"calib/std_conf": 0.30037114109048496,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4912448132780083,
"calib/step_q_c_n": 482.0,
"calib/step_q_gap": 0.09167691204344036,
"calib/step_q_w": 0.3995679012345679,
"calib/step_q_w_n": 486.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2957.0,
"completions/max_terminated_length": 2957.0,
"completions/mean_length": 482.65625,
"completions/mean_terminated_length": 484.5490417480469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.05122093856334686,
"kl": 0.045070648193359375,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0169,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03974894434213638,
"mask/share_reasoning": 0.8590233325958252,
"mask/share_step_conf": 0.09732148051261902,
"num_tokens": 16278046.0,
"reward": 0.3146213889122009,
"reward_std": 0.2293742597103119,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6408277153968811,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.3037724494934082,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.698284924030304,
"adv/mean_abs_reasoning": 0.48682349920272827,
"adv/mean_abs_step_conf": 0.7600011825561523,
"adv/ratio_final_to_reasoning": 1.4343697976245733,
"adv/ratio_step_to_reasoning": 1.5611431736569983,
"adv/std_final_conf": 0.8622894287109375,
"adv/std_reasoning": 0.7394139170646667,
"adv/std_step_conf": 0.9357407093048096,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.636729396495782,
"calib/avg_num_step_conf": 4.40625,
"calib/ece": 0.3746987951807229,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.6987951807228916,
"calib/gap": 0.09127190136275143,
"calib/mean_conf": 0.8117269076305221,
"calib/mu_c": 0.8538805970149254,
"calib/mu_w": 0.7626086956521739,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.32413654618473897,
"calib/std_conf": 0.31846618748173583,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.40028148148148146,
"calib/step_q_c_n": 621.0,
"calib/step_q_gap": 0.020833749726057338,
"calib/step_q_w": 0.3794477317554241,
"calib/step_q_w_n": 507.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2925.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 481.77734375,
"completions/mean_terminated_length": 483.66668701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.043013013899326324,
"kl": 0.05849456787109375,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0153,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03930443152785301,
"mask/share_reasoning": 0.8558655977249146,
"mask/share_step_conf": 0.10092371702194214,
"num_tokens": 16505789.0,
"reward": 0.2700149714946747,
"reward_std": 0.21516259014606476,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6017140746116638,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l1_reward": -0.3593403697013855,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.713843822479248,
"adv/mean_abs_reasoning": 0.4635249078273773,
"adv/mean_abs_step_conf": 0.776650071144104,
"adv/ratio_final_to_reasoning": 1.5400333626625577,
"adv/ratio_step_to_reasoning": 1.6755303933598722,
"adv/std_final_conf": 0.8841903805732727,
"adv/std_reasoning": 0.7392738461494446,
"adv/std_step_conf": 0.9350262880325317,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7127551020408163,
"calib/avg_num_step_conf": 3.61328125,
"calib/ece": 0.28150277777777777,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.23904196428571434,
"calib/mean_conf": 0.7761162698412698,
"calib/mu_c": 0.8823571428571428,
"calib/mu_w": 0.6433151785714285,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.251031746031746,
"calib/std_conf": 0.3459573712344958,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5013888888888889,
"calib/step_q_c_n": 504.0,
"calib/step_q_gap": 0.09784969648983904,
"calib/step_q_w": 0.40353919239904984,
"calib/step_q_w_n": 421.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1327.0,
"completions/max_terminated_length": 1327.0,
"completions/mean_length": 411.609375,
"completions/mean_terminated_length": 411.609375,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.0768,
"grad_norm": 2.196476697921753,
"kl": 13.056953430175781,
"learning_rate": 3.555555555555556e-06,
"loss": 0.1237,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03887656331062317,
"mask/share_reasoning": 0.8669606447219849,
"mask/share_step_conf": 0.09416281431913376,
"num_tokens": 16715569.0,
"reward": 0.35540151596069336,
"reward_std": 0.21558502316474915,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6877049803733826,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l1_reward": -0.28158947825431824,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.749143123626709,
"adv/mean_abs_reasoning": 0.5188073515892029,
"adv/mean_abs_step_conf": 0.7993726134300232,
"adv/ratio_final_to_reasoning": 1.4439716810718757,
"adv/ratio_step_to_reasoning": 1.5407889093733869,
"adv/std_final_conf": 0.9182217717170715,
"adv/std_reasoning": 0.7577370405197144,
"adv/std_step_conf": 0.9359779357910156,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7088681849551414,
"calib/avg_num_step_conf": 3.578125,
"calib/ece": 0.2539442231075698,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6454183266932271,
"calib/gap": 0.20489993098688764,
"calib/mean_conf": 0.7803187250996017,
"calib/mu_c": 0.8537888198757765,
"calib/mu_w": 0.6488888888888888,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19641434262948212,
"calib/std_conf": 0.3351979524678122,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5200347826086956,
"calib/step_q_c_n": 575.0,
"calib/step_q_gap": 0.06804064771133495,
"calib/step_q_w": 0.4519941348973607,
"calib/step_q_w_n": 341.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2977.0,
"completions/max_terminated_length": 2977.0,
"completions/mean_length": 438.71875,
"completions/mean_terminated_length": 438.71875,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.06421296298503876,
"kl": 0.0592193603515625,
"learning_rate": 3.5277777777777784e-06,
"loss": 0.0348,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03809599205851555,
"mask/share_reasoning": 0.870836615562439,
"mask/share_step_conf": 0.09106739610433578,
"num_tokens": 16934913.0,
"reward": 0.36335694789886475,
"reward_std": 0.23348718881607056,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7183008193969727,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l1_reward": -0.3134618401527405,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.6810969114303589,
"adv/mean_abs_reasoning": 0.49199649691581726,
"adv/mean_abs_step_conf": 0.743833601474762,
"adv/ratio_final_to_reasoning": 1.3843531726342708,
"adv/ratio_step_to_reasoning": 1.5118676782002274,
"adv/std_final_conf": 0.8765449523925781,
"adv/std_reasoning": 0.7575379014015198,
"adv/std_step_conf": 0.9211689233779907,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6781118786401191,
"calib/avg_num_step_conf": 3.25390625,
"calib/ece": 0.2613877551020408,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5387755102040817,
"calib/gap": 0.23317215224163623,
"calib/mean_conf": 0.6765714285714286,
"calib/mu_c": 0.7784057971014493,
"calib/mu_w": 0.5452336448598131,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.18734693877551023,
"calib/std_conf": 0.3893307706559961,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5076681614349775,
"calib/step_q_c_n": 446.0,
"calib/step_q_gap": 0.11278444050474495,
"calib/step_q_w": 0.39488372093023255,
"calib/step_q_w_n": 387.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1365.0,
"completions/max_terminated_length": 1365.0,
"completions/mean_length": 415.296875,
"completions/mean_terminated_length": 418.5669250488281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.034008484333753586,
"kl": 0.0598907470703125,
"learning_rate": 3.5e-06,
"loss": -0.0063,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.04049193859100342,
"mask/share_reasoning": 0.8622183203697205,
"mask/share_step_conf": 0.08947721868753433,
"num_tokens": 17145157.0,
"reward": 0.33305859565734863,
"reward_std": 0.20567968487739563,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6704207062721252,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l1_reward": -0.301959753036499,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.6795423030853271,
"adv/mean_abs_reasoning": 0.4613600969314575,
"adv/mean_abs_step_conf": 0.7674668431282043,
"adv/ratio_final_to_reasoning": 1.4729108728843625,
"adv/ratio_step_to_reasoning": 1.6634876926563156,
"adv/std_final_conf": 0.8948934674263,
"adv/std_reasoning": 0.7393792271614075,
"adv/std_step_conf": 0.9358657002449036,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6866719872306465,
"calib/avg_num_step_conf": 3.3671875,
"calib/ece": 0.19417630522088364,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.2722870710295291,
"calib/mean_conf": 0.7721690763052209,
"calib/mu_c": 0.8487156424581005,
"calib/mu_w": 0.5764285714285714,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12373493975903624,
"calib/std_conf": 0.35158548802404604,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.558740635451505,
"calib/step_q_c_n": 598.0,
"calib/step_q_gap": 0.0989300293908989,
"calib/step_q_w": 0.4598106060606061,
"calib/step_q_w_n": 264.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1121.0,
"completions/max_terminated_length": 1121.0,
"completions/mean_length": 376.37109375,
"completions/mean_terminated_length": 377.8470764160156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.08,
"grad_norm": 0.06710561364889145,
"kl": 0.0687713623046875,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.0058,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.042312610894441605,
"mask/share_reasoning": 0.8570511341094971,
"mask/share_step_conf": 0.09672995656728745,
"num_tokens": 17346260.0,
"reward": 0.41363397240638733,
"reward_std": 0.2197515368461609,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.756236732006073,
"rewards/format_reward_step": 0.96875,
"rewards/step_l1_reward": -0.26256251335144043,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.7158702611923218,
"adv/mean_abs_reasoning": 0.4958776831626892,
"adv/mean_abs_step_conf": 0.772760808467865,
"adv/ratio_final_to_reasoning": 1.4436428286639724,
"adv/ratio_step_to_reasoning": 1.5583698051084405,
"adv/std_final_conf": 0.8861145377159119,
"adv/std_reasoning": 0.7576341032981873,
"adv/std_step_conf": 0.9360941648483276,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.648254985754986,
"calib/avg_num_step_conf": 3.0078125,
"calib/ece": 0.2856097560975609,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6300813008130082,
"calib/gap": 0.1681111111111111,
"calib/mean_conf": 0.746829268292683,
"calib/mu_c": 0.8083333333333333,
"calib/mu_w": 0.6402222222222222,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.19914634146341456,
"calib/std_conf": 0.36616961966127015,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5423432343234323,
"calib/step_q_c_n": 505.0,
"calib/step_q_gap": 0.08015191356871532,
"calib/step_q_w": 0.462191320754717,
"calib/step_q_w_n": 265.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2095.0,
"completions/max_terminated_length": 2095.0,
"completions/mean_length": 402.12890625,
"completions/mean_terminated_length": 403.7059020996094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.03836711868643761,
"kl": 0.0735931396484375,
"learning_rate": 3.444444444444445e-06,
"loss": -0.0136,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.04165346175432205,
"mask/share_reasoning": 0.8721697330474854,
"mask/share_step_conf": 0.08227050304412842,
"num_tokens": 17552261.0,
"reward": 0.33543461561203003,
"reward_std": 0.2189868986606598,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6719093918800354,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l1_reward": -0.3151026964187622,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7912461757659912,
"adv/mean_abs_reasoning": 0.5600335597991943,
"adv/mean_abs_step_conf": 0.763465404510498,
"adv/ratio_final_to_reasoning": 1.4128549297111774,
"adv/ratio_step_to_reasoning": 1.3632493823838812,
"adv/std_final_conf": 0.9182538390159607,
"adv/std_reasoning": 0.7931925058364868,
"adv/std_step_conf": 0.9360113143920898,
"calib/answer_extract_rate": 0.8828125,
"calib/auroc": 0.6853621730382293,
"calib/avg_num_step_conf": 3.203125,
"calib/ece": 0.21491150442477885,
"calib/final_conf_rate": 0.8828125,
"calib/format_rate": 0.8828125,
"calib/frac_conf_gt_0.9": 0.588495575221239,
"calib/gap": 0.30158786049631114,
"calib/mean_conf": 0.7150884955752213,
"calib/mu_c": 0.8271830985915493,
"calib/mu_w": 0.5255952380952381,
"calib/nonempty_final_conf_rate": 0.8828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.15084070796460186,
"calib/std_conf": 0.3765989406252195,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.508991935483871,
"calib/step_q_c_n": 496.0,
"calib/step_q_gap": 0.06880675029868583,
"calib/step_q_w": 0.4401851851851852,
"calib/step_q_w_n": 324.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2587.0,
"completions/max_terminated_length": 2587.0,
"completions/mean_length": 419.59765625,
"completions/mean_terminated_length": 421.2431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.04939044639468193,
"kl": 0.07457733154296875,
"learning_rate": 3.416666666666667e-06,
"loss": -0.0774,
"mask/has_final_conf_rate": 0.8828125,
"mask/share_final_conf": 0.04175189882516861,
"mask/share_reasoning": 0.8607625961303711,
"mask/share_step_conf": 0.0935792326927185,
"num_tokens": 17764342.0,
"reward": 0.3393831253051758,
"reward_std": 0.24981173872947693,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.669147253036499,
"rewards/format_reward_step": 0.8828125,
"rewards/step_l1_reward": -0.277880996465683,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7613974809646606,
"adv/mean_abs_reasoning": 0.5785947442054749,
"adv/mean_abs_step_conf": 0.7588104009628296,
"adv/ratio_final_to_reasoning": 1.31594261543148,
"adv/ratio_step_to_reasoning": 1.3114712993199178,
"adv/std_final_conf": 0.9037981033325195,
"adv/std_reasoning": 0.8100994229316711,
"adv/std_step_conf": 0.9359738826751709,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.6475569544364509,
"calib/avg_num_step_conf": 3.3125,
"calib/ece": 0.27897787234042565,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.6212765957446809,
"calib/gap": 0.17865948741007198,
"calib/mean_conf": 0.7504689361702128,
"calib/mu_c": 0.823453237410072,
"calib/mu_w": 0.64479375,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.21897872340425545,
"calib/std_conf": 0.34436838490837135,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.535946127946128,
"calib/step_q_c_n": 495.0,
"calib/step_q_gap": 0.10817275684131211,
"calib/step_q_w": 0.42777337110481584,
"calib/step_q_w_n": 353.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1923.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 443.90234375,
"completions/mean_terminated_length": 445.6431579589844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.0832,
"grad_norm": 0.05135057866573334,
"kl": 0.0748138427734375,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.0086,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.035105351358652115,
"mask/share_reasoning": 0.8789252042770386,
"mask/share_step_conf": 0.08206315338611603,
"num_tokens": 17986005.0,
"reward": 0.3178112506866455,
"reward_std": 0.2435327172279358,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6297679543495178,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l1_reward": -0.2832079827785492,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.8010854125022888,
"adv/mean_abs_reasoning": 0.695340633392334,
"adv/mean_abs_step_conf": 0.7600843906402588,
"adv/ratio_final_to_reasoning": 1.1520762257112194,
"adv/ratio_step_to_reasoning": 1.0931108497601552,
"adv/std_final_conf": 0.9360950589179993,
"adv/std_reasoning": 0.8907221555709839,
"adv/std_step_conf": 0.9353445172309875,
"calib/answer_extract_rate": 0.8828125,
"calib/auroc": 0.6016976953989994,
"calib/avg_num_step_conf": 3.15234375,
"calib/ece": 0.28291371681415933,
"calib/final_conf_rate": 0.8828125,
"calib/format_rate": 0.8671875,
"calib/frac_conf_gt_0.9": 0.5752212389380531,
"calib/gap": 0.15731760026244557,
"calib/mean_conf": 0.7022190265486726,
"calib/mu_c": 0.7641715328467152,
"calib/mu_w": 0.6068539325842697,
"calib/nonempty_final_conf_rate": 0.8828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1894690265486726,
"calib/std_conf": 0.38386494795215287,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5125550660792951,
"calib/step_q_c_n": 454.0,
"calib/step_q_gap": 0.026794914993365004,
"calib/step_q_w": 0.4857601510859301,
"calib/step_q_w_n": 353.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1029.0,
"completions/max_terminated_length": 1029.0,
"completions/mean_length": 431.9375,
"completions/mean_terminated_length": 433.63140869140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.051673173904418945,
"kl": 0.08280181884765625,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.1776,
"mask/has_final_conf_rate": 0.87890625,
"mask/share_final_conf": 0.03425498306751251,
"mask/share_reasoning": 0.8841665387153625,
"mask/share_step_conf": 0.07767222076654434,
"num_tokens": 18202957.0,
"reward": 0.29108428955078125,
"reward_std": 0.263746976852417,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.5915170907974243,
"rewards/format_reward_step": 0.8671875,
"rewards/step_l1_reward": -0.289817214012146,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.746005654335022,
"adv/mean_abs_reasoning": 0.6409313678741455,
"adv/mean_abs_step_conf": 0.7787002921104431,
"adv/ratio_final_to_reasoning": 1.1639399968976227,
"adv/ratio_step_to_reasoning": 1.214951133837079,
"adv/std_final_conf": 0.90828537940979,
"adv/std_reasoning": 0.8435416221618652,
"adv/std_step_conf": 0.9360520839691162,
"calib/answer_extract_rate": 0.80078125,
"calib/auroc": 0.7184604770352848,
"calib/avg_num_step_conf": 3.47265625,
"calib/ece": 0.3028407224958948,
"calib/final_conf_rate": 0.79296875,
"calib/format_rate": 0.78125,
"calib/frac_conf_gt_0.9": 0.7389162561576355,
"calib/gap": 0.2102201195873582,
"calib/mean_conf": 0.8577175697865353,
"calib/mu_c": 0.9498830409356728,
"calib/mu_w": 0.7396629213483146,
"calib/nonempty_final_conf_rate": 0.79296875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.29949096880131354,
"calib/std_conf": 0.264480346069493,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5575022988505747,
"calib/step_q_c_n": 435.0,
"calib/step_q_gap": 0.09140097726467161,
"calib/step_q_w": 0.46610132158590306,
"calib/step_q_w_n": 454.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2102.0,
"completions/max_terminated_length": 2102.0,
"completions/mean_length": 392.23046875,
"completions/mean_terminated_length": 392.23046875,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.07009822130203247,
"kl": 0.0958251953125,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.1183,
"mask/has_final_conf_rate": 0.79296875,
"mask/share_final_conf": 0.03364042192697525,
"mask/share_reasoning": 0.8711358904838562,
"mask/share_step_conf": 0.09522369503974915,
"num_tokens": 18405528.0,
"reward": 0.2661329507827759,
"reward_std": 0.23599350452423096,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.5544984340667725,
"rewards/format_reward_step": 0.78125,
"rewards/step_l1_reward": -0.26910752058029175,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.7740898728370667,
"adv/mean_abs_reasoning": 0.7841185331344604,
"adv/mean_abs_step_conf": 0.7297874093055725,
"adv/ratio_final_to_reasoning": 0.9872102751387537,
"adv/ratio_step_to_reasoning": 0.9307105730409113,
"adv/std_final_conf": 0.9220060706138611,
"adv/std_reasoning": 0.921212375164032,
"adv/std_step_conf": 0.9199628829956055,
"calib/answer_extract_rate": 0.55859375,
"calib/auroc": 0.6680855481727574,
"calib/avg_num_step_conf": 2.89453125,
"calib/ece": 0.27380281690140845,
"calib/final_conf_rate": 0.5546875,
"calib/format_rate": 0.55078125,
"calib/frac_conf_gt_0.9": 0.5915492957746479,
"calib/gap": 0.17347591362126247,
"calib/mean_conf": 0.7830985915492958,
"calib/mu_c": 0.8515116279069768,
"calib/mu_w": 0.6780357142857143,
"calib/nonempty_final_conf_rate": 0.5546875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.22563380281690143,
"calib/std_conf": 0.3208750867013859,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5628947368421053,
"calib/step_q_c_n": 266.0,
"calib/step_q_gap": 0.10815431578947377,
"calib/step_q_w": 0.45474042105263157,
"calib/step_q_w_n": 475.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2907.0,
"completions/max_terminated_length": 2907.0,
"completions/mean_length": 426.8046875,
"completions/mean_terminated_length": 428.47845458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.0864,
"grad_norm": 0.08157815784215927,
"kl": 0.08916473388671875,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.3737,
"mask/has_final_conf_rate": 0.5546875,
"mask/share_final_conf": 0.024003587663173676,
"mask/share_reasoning": 0.8975480794906616,
"mask/share_step_conf": 0.07454213500022888,
"num_tokens": 18621038.0,
"reward": 0.19895264506340027,
"reward_std": 0.2581195831298828,
"rewards/accuracy_reward_step": 0.33984375,
"rewards/final_brier_reward_step": 0.39003515243530273,
"rewards/format_reward_step": 0.55078125,
"rewards/step_l1_reward": -0.17025485634803772,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.7679874897003174,
"adv/mean_abs_reasoning": 0.7872791886329651,
"adv/mean_abs_step_conf": 0.7426701784133911,
"adv/ratio_final_to_reasoning": 0.9754957336467309,
"adv/ratio_step_to_reasoning": 0.9433377499829085,
"adv/std_final_conf": 0.9068247675895691,
"adv/std_reasoning": 0.9212319254875183,
"adv/std_step_conf": 0.9205906987190247,
"calib/answer_extract_rate": 0.46875,
"calib/auroc": 0.7012411347517731,
"calib/avg_num_step_conf": 2.69921875,
"calib/ece": 0.29076722689075624,
"calib/final_conf_rate": 0.46484375,
"calib/format_rate": 0.45703125,
"calib/frac_conf_gt_0.9": 0.680672268907563,
"calib/gap": 0.2214624408983451,
"calib/mean_conf": 0.7929302521008401,
"calib/mu_c": 0.8803986111111111,
"calib/mu_w": 0.658936170212766,
"calib/nonempty_final_conf_rate": 0.46484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.23932773109243693,
"calib/std_conf": 0.34122963399802164,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5550433884297522,
"calib/step_q_c_n": 242.0,
"calib/step_q_gap": 0.029007753685876936,
"calib/step_q_w": 0.5260356347438753,
"calib/step_q_w_n": 449.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 367.9296875,
"completions/mean_terminated_length": 367.9296875,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.0639415830373764,
"kl": 0.1104278564453125,
"learning_rate": 3.277777777777778e-06,
"loss": -0.5865,
"mask/has_final_conf_rate": 0.46484375,
"mask/share_final_conf": 0.021395236253738403,
"mask/share_reasoning": 0.9011713266372681,
"mask/share_step_conf": 0.07743342220783234,
"num_tokens": 18820780.0,
"reward": 0.16285640001296997,
"reward_std": 0.24152183532714844,
"rewards/accuracy_reward_step": 0.28125,
"rewards/final_brier_reward_step": 0.32856854796409607,
"rewards/format_reward_step": 0.45703125,
"rewards/step_l1_reward": -0.1505119949579239,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.5532522797584534,
"adv/mean_abs_reasoning": 0.6258186101913452,
"adv/mean_abs_step_conf": 0.6101732850074768,
"adv/ratio_final_to_reasoning": 0.8840457454425898,
"adv/ratio_step_to_reasoning": 0.9750002238203098,
"adv/std_final_conf": 0.7766796350479126,
"adv/std_reasoning": 0.8271476626396179,
"adv/std_step_conf": 0.8270086646080017,
"calib/answer_extract_rate": 0.32421875,
"calib/auroc": 0.7380116959064328,
"calib/avg_num_step_conf": 2.52734375,
"calib/ece": 0.27225301204819274,
"calib/final_conf_rate": 0.32421875,
"calib/format_rate": 0.3203125,
"calib/frac_conf_gt_0.9": 0.6024096385542169,
"calib/gap": 0.28448245614035106,
"calib/mean_conf": 0.7564216867469878,
"calib/mu_c": 0.8866666666666666,
"calib/mu_w": 0.6021842105263155,
"calib/nonempty_final_conf_rate": 0.32421875,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.24325301204819277,
"calib/std_conf": 0.35959224298701437,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5001329729729729,
"calib/step_q_c_n": 185.0,
"calib/step_q_gap": -0.03067763308763316,
"calib/step_q_w": 0.530810606060606,
"calib/step_q_w_n": 462.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1511.0,
"completions/max_terminated_length": 1511.0,
"completions/mean_length": 405.5234375,
"completions/mean_terminated_length": 407.1137390136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.06789255887269974,
"kl": 0.1085205078125,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.5658,
"mask/has_final_conf_rate": 0.32421875,
"mask/share_final_conf": 0.014378046616911888,
"mask/share_reasoning": 0.9094371795654297,
"mask/share_step_conf": 0.07227854430675507,
"num_tokens": 19031858.0,
"reward": 0.12394683808088303,
"reward_std": 0.16958144307136536,
"rewards/accuracy_reward_step": 0.17578125,
"rewards/final_brier_reward_step": 0.2288168966770172,
"rewards/format_reward_step": 0.3203125,
"rewards/step_l1_reward": -0.08014197647571564,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.40271973609924316,
"adv/mean_abs_reasoning": 0.4289896786212921,
"adv/mean_abs_step_conf": 0.4000370502471924,
"adv/ratio_final_to_reasoning": 0.9387632294406789,
"adv/ratio_step_to_reasoning": 0.93250973201232,
"adv/std_final_conf": 0.7011737823486328,
"adv/std_reasoning": 0.7209097743034363,
"adv/std_step_conf": 0.7012189626693726,
"calib/answer_extract_rate": 0.13671875,
"calib/auroc": 0.8480392156862745,
"calib/avg_num_step_conf": 1.84765625,
"calib/ece": 0.3411428571428571,
"calib/final_conf_rate": 0.13671875,
"calib/format_rate": 0.1328125,
"calib/frac_conf_gt_0.9": 0.42857142857142855,
"calib/gap": 0.30251633986928106,
"calib/mean_conf": 0.6897142857142857,
"calib/mu_c": 0.8452941176470589,
"calib/mu_w": 0.5427777777777778,
"calib/nonempty_final_conf_rate": 0.13671875,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.2725714285714285,
"calib/std_conf": 0.36861505832574143,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.4917241379310345,
"calib/step_q_c_n": 58.0,
"calib/step_q_gap": -0.11404293034205787,
"calib/step_q_w": 0.6057670682730923,
"calib/step_q_w_n": 415.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 913.0,
"completions/max_terminated_length": 913.0,
"completions/mean_length": 315.5,
"completions/mean_terminated_length": 316.7372741699219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.0896,
"grad_norm": 0.10951647907495499,
"kl": 0.1318359375,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.9894,
"mask/has_final_conf_rate": 0.13671875,
"mask/share_final_conf": 0.007692576386034489,
"mask/share_reasoning": 0.9166797995567322,
"mask/share_step_conf": 0.0717214047908783,
"num_tokens": 19218546.0,
"reward": 0.047167807817459106,
"reward_std": 0.09039464592933655,
"rewards/accuracy_reward_step": 0.06640625,
"rewards/final_brier_reward_step": 0.09505703300237656,
"rewards/format_reward_step": 0.1328125,
"rewards/step_l1_reward": -0.04056517034769058,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.22187048196792603,
"adv/mean_abs_reasoning": 0.25201520323753357,
"adv/mean_abs_step_conf": 0.24062125384807587,
"adv/ratio_final_to_reasoning": 0.8803853065912256,
"adv/ratio_step_to_reasoning": 0.9547886427362936,
"adv/std_final_conf": 0.5468955039978027,
"adv/std_reasoning": 0.5484980940818787,
"adv/std_step_conf": 0.5483436584472656,
"calib/answer_extract_rate": 0.08984375,
"calib/auroc": 0.6470588235294117,
"calib/avg_num_step_conf": 1.86328125,
"calib/ece": 0.5965217391304346,
"calib/final_conf_rate": 0.08984375,
"calib/format_rate": 0.0859375,
"calib/frac_conf_gt_0.9": 0.7391304347826086,
"calib/gap": 0.1454901960784314,
"calib/mean_conf": 0.8191304347826085,
"calib/mu_c": 0.9266666666666666,
"calib/mu_w": 0.7811764705882352,
"calib/nonempty_final_conf_rate": 0.08984375,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.5773913043478259,
"calib/std_conf": 0.3387176975579242,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.5032000000000001,
"calib/step_q_c_n": 25.0,
"calib/step_q_gap": -0.0639567109144541,
"calib/step_q_w": 0.5671567109144542,
"calib/step_q_w_n": 452.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2447.0,
"completions/max_terminated_length": 2447.0,
"completions/mean_length": 329.05078125,
"completions/mean_terminated_length": 329.05078125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.09789498895406723,
"kl": 0.1246795654296875,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.5054,
"mask/has_final_conf_rate": 0.08984375,
"mask/share_final_conf": 0.005012996960431337,
"mask/share_reasoning": 0.9164111614227295,
"mask/share_step_conf": 0.07857586443424225,
"num_tokens": 19410607.0,
"reward": 0.015317767858505249,
"reward_std": 0.0544668585062027,
"rewards/accuracy_reward_step": 0.0234375,
"rewards/final_brier_reward_step": 0.035423435270786285,
"rewards/format_reward_step": 0.0859375,
"rewards/step_l1_reward": -0.026662901043891907,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.08323132991790771,
"adv/mean_abs_reasoning": 0.08747018873691559,
"adv/mean_abs_step_conf": 0.08138076215982437,
"adv/ratio_final_to_reasoning": 0.9515393886738132,
"adv/ratio_step_to_reasoning": 0.9303828348260867,
"adv/std_final_conf": 0.33108845353126526,
"adv/std_reasoning": 0.3308155834674835,
"adv/std_step_conf": 0.33072537183761597,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 1.5703125,
"calib/ece": 0.3833333333333333,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.48400000000000004,
"calib/mean_conf": 0.47333333333333333,
"calib/mu_c": 0.554,
"calib/mu_w": 0.07,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.011666666666666667,
"calib/std_conf": 0.2590152290674988,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.32419354838709674,
"calib/step_q_c_n": 31.0,
"calib/step_q_gap": -0.26582531953743155,
"calib/step_q_w": 0.5900188679245283,
"calib/step_q_w_n": 371.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1241.0,
"completions/max_terminated_length": 1241.0,
"completions/mean_length": 309.203125,
"completions/mean_terminated_length": 311.6377868652344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.07561160624027252,
"kl": 0.138153076171875,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.2178,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0008216683054342866,
"mask/share_reasoning": 0.9200811386108398,
"mask/share_step_conf": 0.07128473371267319,
"num_tokens": 19595275.0,
"reward": 0.008734840899705887,
"reward_std": 0.01978309080004692,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.014989453367888927,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l1_reward": -0.005332270171493292,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.057972054928541183,
"adv/mean_abs_reasoning": 0.057895708829164505,
"adv/mean_abs_step_conf": 0.05789738893508911,
"adv/ratio_final_to_reasoning": 1.0013186832136722,
"adv/ratio_step_to_reasoning": 1.00002901952422,
"adv/std_final_conf": 0.2868097424507141,
"adv/std_reasoning": 0.2864321172237396,
"adv/std_step_conf": 0.2864404320716858,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 1.26953125,
"calib/ece": 0.010000000000000009,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.6666666666666666,
"calib/gap": 0.985,
"calib/mean_conf": 0.6566666666666666,
"calib/mu_c": 0.985,
"calib/mu_w": 0.0,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.0,
"calib/std_conf": 0.46435139950496784,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.32222222222222224,
"calib/step_q_c_n": 9.0,
"calib/step_q_gap": -0.3608895921237693,
"calib/step_q_w": 0.6831118143459916,
"calib/step_q_w_n": 316.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2022.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 243.0,
"completions/mean_terminated_length": 243.9529571533203,
"completions/min_length": 0.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.0928,
"grad_norm": 0.03810049220919609,
"kl": 0.1692962646484375,
"learning_rate": 3.138888888888889e-06,
"loss": -0.1218,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0005845353007316589,
"mask/share_reasoning": 0.9222879409790039,
"mask/share_step_conf": 0.0732213482260704,
"num_tokens": 19762979.0,
"reward": 0.005352574400603771,
"reward_std": 0.015139367431402206,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.011716797016561031,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.004917897284030914,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.02810167334973812,
"adv/mean_abs_reasoning": 0.02824888564646244,
"adv/mean_abs_step_conf": 0.027679258957505226,
"adv/ratio_final_to_reasoning": 0.9947887396845775,
"adv/ratio_step_to_reasoning": 0.979835427985155,
"adv/std_final_conf": 0.16561469435691833,
"adv/std_reasoning": 0.1653638482093811,
"adv/std_step_conf": 0.1653398722410202,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 1.2890625,
"calib/ece": 0.20333333333333334,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.20333333333333334,
"calib/mu_c": NaN,
"calib/mu_w": 0.20333333333333334,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.20333333333333334,
"calib/std_conf": 0.18116904322268257,
"calib/step_conf_rate": 0.921875,
"calib/step_q_w": 0.647889797979798,
"calib/step_q_w_n": 330.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1410.0,
"completions/max_terminated_length": 1410.0,
"completions/mean_length": 313.7265625,
"completions/mean_terminated_length": 313.7265625,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.02372133545577526,
"kl": 0.135498046875,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.0887,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0003868588828481734,
"mask/share_reasoning": 0.9381458163261414,
"mask/share_step_conf": 0.061467334628105164,
"num_tokens": 19953141.0,
"reward": 0.005528590641915798,
"reward_std": 0.007678534835577011,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.010849609039723873,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.002136178081855178,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.97265625,
"calib/ece": 0.95,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.95,
"calib/mu_c": NaN,
"calib/mu_w": 0.95,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.95,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_w": 0.712595983935743,
"calib/step_q_w_n": 249.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1346.0,
"completions/max_terminated_length": 1346.0,
"completions/mean_length": 292.59375,
"completions/mean_terminated_length": 294.89764404296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.0020648923236876726,
"kl": 0.142333984375,
"learning_rate": 3.0833333333333336e-06,
"loss": 0.0182,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 6.620762724196538e-05,
"mask/share_reasoning": 0.9293363094329834,
"mask/share_step_conf": 0.06278496980667114,
"num_tokens": 20136933.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 1.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_w": 0.6169270833333333,
"calib/step_q_w_n": 256.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1303.0,
"completions/max_terminated_length": 1303.0,
"completions/mean_length": 252.015625,
"completions/mean_terminated_length": 253.00393676757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.096,
"grad_norm": 0.001729490701109171,
"kl": 0.1618499755859375,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0186,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9174002408981323,
"mask/share_step_conf": 0.07869353890419006,
"num_tokens": 20304769.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.01932401768863201,
"adv/mean_abs_reasoning": 0.03858806565403938,
"adv/mean_abs_step_conf": 0.01931961625814438,
"adv/ratio_final_to_reasoning": 0.5007770501346491,
"adv/ratio_step_to_reasoning": 0.5006629881724068,
"adv/std_final_conf": 0.16558969020843506,
"adv/std_reasoning": 0.23381583392620087,
"adv/std_step_conf": 0.16555197536945343,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.9765625,
"calib/ece": 0.0050000000000000044,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.995,
"calib/mu_c": 0.995,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.0,
"calib/std_conf": 0.0050000000000000044,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.28,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": -0.3844979919678715,
"calib/step_q_w": 0.6644979919678715,
"calib/step_q_w_n": 249.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2736.0,
"completions/max_terminated_length": 2736.0,
"completions/mean_length": 262.5859375,
"completions/mean_terminated_length": 263.6156921386719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.027071869000792503,
"kl": 0.1643524169921875,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0465,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00029008882120251656,
"mask/share_reasoning": 0.9296793341636658,
"mask/share_step_conf": 0.06612434983253479,
"num_tokens": 20479703.0,
"reward": 0.001732663600705564,
"reward_std": 0.0049007125198841095,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.003905859310179949,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0027842819690704346,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.96484375,
"calib/ece": 0.31000000000000005,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.69,
"calib/mu_c": 0.69,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.953125,
"calib/step_q_w": 0.7439919028340082,
"calib/step_q_w_n": 247.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2197.0,
"completions/max_terminated_length": 2197.0,
"completions/mean_length": 253.13671875,
"completions/mean_terminated_length": 253.13671875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.0024913293309509754,
"kl": 0.16802978515625,
"learning_rate": 3e-06,
"loss": 0.0157,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.000139865733217448,
"mask/share_reasoning": 0.9277439117431641,
"mask/share_step_conf": 0.0721161812543869,
"num_tokens": 20651226.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.921875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/step_conf_rate": 0.921875,
"calib/step_q_w": 0.6648535310734464,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1561.0,
"completions/max_terminated_length": 1561.0,
"completions/mean_length": 252.34375,
"completions/mean_terminated_length": 252.34375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.0992,
"grad_norm": 0.0017887783469632268,
"kl": 0.1580810546875,
"learning_rate": 2.9722222222222225e-06,
"loss": 0.0193,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9293789863586426,
"mask/share_step_conf": 0.07062099128961563,
"num_tokens": 20821602.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.9296875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_w": 0.6646086834733892,
"calib/step_q_w_n": 238.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2464.0,
"completions/max_terminated_length": 2464.0,
"completions/mean_length": 262.765625,
"completions/mean_terminated_length": 262.765625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.00258839363232255,
"kl": 0.1856231689453125,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0206,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.930269181728363,
"mask/share_step_conf": 0.06973081827163696,
"num_tokens": 20997550.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.89453125,
"calib/ece": 0.64,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.64,
"calib/mu_c": NaN,
"calib/mu_w": 0.64,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.90234375,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.64,
"calib/std_conf": 0.34,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_w": 0.6451854439592429,
"calib/step_q_w_n": 229.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3054.0,
"completions/max_terminated_length": 3054.0,
"completions/mean_length": 277.64453125,
"completions/mean_terminated_length": 278.73333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.002397137461230159,
"kl": 0.14752960205078125,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0195,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00020610095816664398,
"mask/share_reasoning": 0.920891284942627,
"mask/share_step_conf": 0.07499632984399796,
"num_tokens": 21174755.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.921875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/step_conf_rate": 0.921875,
"calib/step_q_w": 0.685409604519774,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 975.0,
"completions/max_terminated_length": 975.0,
"completions/mean_length": 244.8984375,
"completions/mean_terminated_length": 244.8984375,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.1024,
"grad_norm": 0.0031177501659840345,
"kl": 0.1843109130859375,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0152,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9352742433547974,
"mask/share_step_conf": 0.06472573429346085,
"num_tokens": 21343265.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.94140625,
"calib/ece": 0.9,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.9,
"calib/mu_c": NaN,
"calib/mu_w": 0.9,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.9,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_w": 0.6480928077455048,
"calib/step_q_w_n": 241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2585.0,
"completions/max_terminated_length": 2585.0,
"completions/mean_length": 265.30859375,
"completions/mean_terminated_length": 265.30859375,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.002110776724293828,
"kl": 0.1783294677734375,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0197,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 8.387653360841796e-05,
"mask/share_reasoning": 0.9345265030860901,
"mask/share_step_conf": 0.06538967043161392,
"num_tokens": 21516256.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.88671875,
"calib/ece": 0.0,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.0,
"calib/mu_c": NaN,
"calib/mu_w": 0.0,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.88671875,
"calib/pce": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.88671875,
"calib/step_q_w": 0.6609838472834068,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1131.0,
"completions/max_terminated_length": 1131.0,
"completions/mean_length": 265.10546875,
"completions/mean_terminated_length": 268.2490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.0032506384886801243,
"kl": 0.165618896484375,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0207,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0001435853773728013,
"mask/share_reasoning": 0.9282910823822021,
"mask/share_step_conf": 0.05984655022621155,
"num_tokens": 21690307.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.87109375,
"calib/ece": 0.6261333333333332,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/mean_conf": 0.6261333333333333,
"calib/mu_c": NaN,
"calib/mu_w": 0.6261333333333333,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.87890625,
"calib/nonempty_step_conf_rate": 0.87109375,
"calib/pce": 0.6261333333333332,
"calib/std_conf": 0.4335165151281885,
"calib/step_conf_rate": 0.87109375,
"calib/step_q_w": 0.6548052316890881,
"calib/step_q_w_n": 223.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2595.0,
"completions/max_terminated_length": 2595.0,
"completions/mean_length": 307.17578125,
"completions/mean_terminated_length": 308.3804016113281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.1056,
"grad_norm": 0.0021076835691928864,
"kl": 0.1526947021484375,
"learning_rate": 2.805555555555556e-06,
"loss": 0.0201,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0004948556306771934,
"mask/share_reasoning": 0.9374754428863525,
"mask/share_step_conf": 0.05812348425388336,
"num_tokens": 21874744.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.88671875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.88671875,
"calib/nonempty_step_conf_rate": 0.88671875,
"calib/step_conf_rate": 0.88671875,
"calib/step_q_w": 0.6474894273127755,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2829.0,
"completions/max_terminated_length": 2829.0,
"completions/mean_length": 283.796875,
"completions/mean_terminated_length": 283.796875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.003983226139098406,
"kl": 0.1564483642578125,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0206,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9387972354888916,
"mask/share_step_conf": 0.06120274215936661,
"num_tokens": 22054804.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.890625,
"calib/ece": 0.12,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.12,
"calib/mu_c": NaN,
"calib/mu_w": 0.12,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.890625,
"calib/pce": 0.12,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.890625,
"calib/step_q_w": 0.7245760233918128,
"calib/step_q_w_n": 228.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2346.0,
"completions/max_terminated_length": 2346.0,
"completions/mean_length": 327.859375,
"completions/mean_terminated_length": 327.859375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.0020883805118501186,
"kl": 0.144989013671875,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0196,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 8.778089977568015e-05,
"mask/share_reasoning": 0.9369585514068604,
"mask/share_step_conf": 0.0629536435008049,
"num_tokens": 22245728.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.91015625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9140625,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_w": 0.6792072961373391,
"calib/step_q_w_n": 233.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1122.0,
"completions/max_terminated_length": 1122.0,
"completions/mean_length": 224.41796875,
"completions/mean_terminated_length": 224.41796875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.1088,
"grad_norm": 0.0024927027989178896,
"kl": 0.187957763671875,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0142,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9262990951538086,
"mask/share_step_conf": 0.0737009346485138,
"num_tokens": 22409875.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.90234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.90234375,
"calib/nonempty_step_conf_rate": 0.90234375,
"calib/step_conf_rate": 0.90234375,
"calib/step_q_w": 0.6472352092352092,
"calib/step_q_w_n": 231.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1993.0,
"completions/max_terminated_length": 1993.0,
"completions/mean_length": 315.05859375,
"completions/mean_terminated_length": 315.05859375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.002758257556706667,
"kl": 0.150848388671875,
"learning_rate": 2.6944444444444444e-06,
"loss": 0.0194,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9427634477615356,
"mask/share_step_conf": 0.05723656713962555,
"num_tokens": 22595082.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.8828125,
"calib/ece": 0.015,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.97,
"calib/mean_conf": 0.515,
"calib/mu_c": 1.0,
"calib/mu_w": 0.03,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.8828125,
"calib/pce": 0.015,
"calib/std_conf": 0.485,
"calib/step_conf_rate": 0.8828125,
"calib/step_q_w": 0.6190050147492626,
"calib/step_q_w_n": 226.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2761.0,
"completions/max_terminated_length": 2761.0,
"completions/mean_length": 265.98828125,
"completions/mean_terminated_length": 265.98828125,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.0026147959288209677,
"kl": 0.17083740234375,
"learning_rate": 2.666666666666667e-06,
"loss": 0.009,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00016867897647898644,
"mask/share_reasoning": 0.9309343099594116,
"mask/share_step_conf": 0.06889700889587402,
"num_tokens": 22769855.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 0.25,
"calib/avg_num_step_conf": 0.84765625,
"calib/ece": 0.33000000000000007,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.0050000000000000044,
"calib/mean_conf": 0.9966666666666667,
"calib/mu_c": 0.995,
"calib/mu_w": 1.0,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.859375,
"calib/nonempty_step_conf_rate": 0.84765625,
"calib/pce": 0.33000000000000007,
"calib/std_conf": 0.004714045207910321,
"calib/step_conf_rate": 0.84765625,
"calib/step_q_w": 0.6160026113671274,
"calib/step_q_w_n": 217.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2069.0,
"completions/max_terminated_length": 2069.0,
"completions/mean_length": 307.92578125,
"completions/mean_terminated_length": 309.13336181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.112,
"grad_norm": 0.01371827907860279,
"kl": 0.16375732421875,
"learning_rate": 2.6388888888888893e-06,
"loss": 0.0188,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0005875006900168955,
"mask/share_reasoning": 0.938085675239563,
"mask/share_step_conf": 0.05742061138153076,
"num_tokens": 22954444.0,
"reward": 0.0007812500116415322,
"reward_std": 0.00220970856025815,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.93359375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_w": 0.6228207810320782,
"calib/step_q_w_n": 239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 899.0,
"completions/max_terminated_length": 899.0,
"completions/mean_length": 256.625,
"completions/mean_terminated_length": 256.625,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.0019686499144881964,
"kl": 0.1787109375,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0197,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9375455975532532,
"mask/share_step_conf": 0.06245441734790802,
"num_tokens": 23124724.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.91796875,
"calib/ece": 1.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.0,
"calib/mu_c": 0.0,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_w": 0.6645486524822696,
"calib/step_q_w_n": 235.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1825.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 262.71875,
"completions/mean_terminated_length": 262.71875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.002731727436184883,
"kl": 0.17803955078125,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0128,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 5.9964364481857046e-05,
"mask/share_reasoning": 0.930545449256897,
"mask/share_step_conf": 0.06939459592103958,
"num_tokens": 23296596.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.89453125,
"calib/ece": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.0,
"calib/mu_c": NaN,
"calib/mu_w": 0.0,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_w": 0.6813867540029113,
"calib/step_q_w_n": 229.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1382.0,
"completions/max_terminated_length": 1382.0,
"completions/mean_length": 279.796875,
"completions/mean_terminated_length": 280.8941345214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.1152,
"grad_norm": 0.002479060785844922,
"kl": 0.161041259765625,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0199,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00010872266284422949,
"mask/share_reasoning": 0.9320110082626343,
"mask/share_step_conf": 0.06397401541471481,
"num_tokens": 23471456.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.9375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_w": 0.6219333333333333,
"calib/step_q_w_n": 240.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2597.0,
"completions/max_terminated_length": 2597.0,
"completions/mean_length": 295.53515625,
"completions/mean_terminated_length": 295.53515625,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.0018036911496892571,
"kl": 0.159149169921875,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0202,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9353963136672974,
"mask/share_step_conf": 0.06460371613502502,
"num_tokens": 23651713.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.9453125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_w": 0.7107644628099173,
"calib/step_q_w_n": 242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2162.0,
"completions/max_terminated_length": 2162.0,
"completions/mean_length": 246.1640625,
"completions/mean_terminated_length": 246.1640625,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.002016980666667223,
"kl": 0.1808013916015625,
"learning_rate": 2.5e-06,
"loss": 0.0193,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9226292371749878,
"mask/share_step_conf": 0.07737080752849579,
"num_tokens": 23819651.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.90625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/step_conf_rate": 0.90625,
"calib/step_q_w": 0.6728936781609195,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2647.0,
"completions/max_terminated_length": 2647.0,
"completions/mean_length": 301.8828125,
"completions/mean_terminated_length": 301.8828125,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.1184,
"grad_norm": 0.0021393022034317255,
"kl": 0.1673126220703125,
"learning_rate": 2.4722222222222226e-06,
"loss": 0.0199,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9377536177635193,
"mask/share_step_conf": 0.06224638968706131,
"num_tokens": 24004341.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.87890625,
"calib/ece": 0.035,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.9299999999999999,
"calib/mean_conf": 0.535,
"calib/mu_c": 1.0,
"calib/mu_w": 0.07,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.87890625,
"calib/pce": 0.035,
"calib/std_conf": 0.465,
"calib/step_conf_rate": 0.87890625,
"calib/step_q_w": 0.5784821250000001,
"calib/step_q_w_n": 224.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 318.13671875,
"completions/mean_terminated_length": 318.13671875,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.0022719684056937695,
"kl": 0.14599609375,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0105,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0002676117292139679,
"mask/share_reasoning": 0.9370044469833374,
"mask/share_step_conf": 0.06272794306278229,
"num_tokens": 24193704.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.90234375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.890625,
"calib/step_conf_rate": 0.890625,
"calib/step_q_w": 0.6473191919191921,
"calib/step_q_w_n": 231.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1491.0,
"completions/max_terminated_length": 1491.0,
"completions/mean_length": 225.890625,
"completions/mean_terminated_length": 225.890625,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.0022563741076737642,
"kl": 0.21331787109375,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0204,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9262930154800415,
"mask/share_step_conf": 0.0737069845199585,
"num_tokens": 24356732.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.90625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/step_conf_rate": 0.90625,
"calib/step_q_w": 0.6297916666666667,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2296.0,
"completions/max_terminated_length": 2296.0,
"completions/mean_length": 231.88671875,
"completions/mean_terminated_length": 232.7960968017578,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.1216,
"grad_norm": 0.0019388310611248016,
"kl": 0.197021484375,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0203,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9167957305908203,
"mask/share_step_conf": 0.07929803431034088,
"num_tokens": 24521119.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.8984375,
"calib/ece": 0.5,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.5,
"calib/mu_c": NaN,
"calib/mu_w": 0.5,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.91015625,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.5,
"calib/std_conf": 0.5,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_w": 0.5842565217391303,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 886.0,
"completions/max_terminated_length": 886.0,
"completions/mean_length": 237.8046875,
"completions/mean_terminated_length": 237.8046875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.0018869774648919702,
"kl": 0.196868896484375,
"learning_rate": 2.361111111111111e-06,
"loss": 0.021,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00013558330829255283,
"mask/share_reasoning": 0.9318229556083679,
"mask/share_step_conf": 0.06804148852825165,
"num_tokens": 24687261.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.88671875,
"calib/ece": 0.96,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.96,
"calib/mu_c": NaN,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.88671875,
"calib/pce": 0.96,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.88671875,
"calib/step_q_w": 0.57208046989721,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2479.0,
"completions/max_terminated_length": 2479.0,
"completions/mean_length": 273.47265625,
"completions/mean_terminated_length": 273.47265625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.0021785381250083447,
"kl": 0.1786956787109375,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0193,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 8.387653360841796e-05,
"mask/share_reasoning": 0.9253822565078735,
"mask/share_step_conf": 0.07453387975692749,
"num_tokens": 24861790.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.87890625,
"calib/ece": 0.765,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.765,
"calib/mu_c": NaN,
"calib/mu_w": 0.765,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.88671875,
"calib/nonempty_step_conf_rate": 0.87890625,
"calib/pce": 0.765,
"calib/std_conf": 0.015000000000000013,
"calib/step_conf_rate": 0.87890625,
"calib/step_q_w": 0.6962148148148147,
"calib/step_q_w_n": 225.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1893.0,
"completions/max_terminated_length": 1893.0,
"completions/mean_length": 263.1171875,
"completions/mean_terminated_length": 263.1171875,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.1248,
"grad_norm": 0.0013779336586594582,
"kl": 0.170562744140625,
"learning_rate": 2.305555555555556e-06,
"loss": 0.0194,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00015250130672939122,
"mask/share_reasoning": 0.9313722848892212,
"mask/share_step_conf": 0.06847520172595978,
"num_tokens": 25035748.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.9140625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_w": 0.6240760683760683,
"calib/step_q_w_n": 234.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2398.0,
"completions/max_terminated_length": 2398.0,
"completions/mean_length": 220.796875,
"completions/mean_terminated_length": 221.6627655029297,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.0024035677779465914,
"kl": 0.216339111328125,
"learning_rate": 2.277777777777778e-06,
"loss": 0.02,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9155505895614624,
"mask/share_step_conf": 0.0805431604385376,
"num_tokens": 25196280.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.90625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/step_conf_rate": 0.90625,
"calib/step_q_w": 0.6257777298850574,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1235.0,
"completions/max_terminated_length": 1235.0,
"completions/mean_length": 281.3515625,
"completions/mean_terminated_length": 281.3515625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.12693333333333334,
"grad_norm": 1.403180480003357,
"kl": 0.5548095703125,
"learning_rate": 2.25e-06,
"loss": 0.198,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9420675039291382,
"mask/share_step_conf": 0.057932544499635696,
"num_tokens": 25373370.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.94140625,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_w": 0.6420318118948825,
"calib/step_q_w_n": 241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1320.0,
"completions/max_terminated_length": 1320.0,
"completions/mean_length": 235.8515625,
"completions/mean_terminated_length": 235.8515625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.128,
"grad_norm": 0.0016463312786072493,
"kl": 0.202301025390625,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0194,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9314963817596436,
"mask/share_step_conf": 0.06850366294384003,
"num_tokens": 25540436.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.92578125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_w": 0.5966310829817159,
"calib/step_q_w_n": 237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 270.73046875,
"completions/mean_terminated_length": 271.79217529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.09108592569828033,
"kl": 0.239715576171875,
"learning_rate": 2.1944444444444445e-06,
"loss": 0.0368,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9250909090042114,
"mask/share_step_conf": 0.07100285589694977,
"num_tokens": 25714799.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.95703125,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_w": 0.6297778231292517,
"calib/step_q_w_n": 245.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2014.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 213.75390625,
"completions/mean_terminated_length": 213.75390625,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.015449677594006062,
"kl": 0.256256103515625,
"learning_rate": 2.166666666666667e-06,
"loss": 0.0222,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.924127459526062,
"mask/share_step_conf": 0.07587258517742157,
"num_tokens": 25876864.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.01880052126944065,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.019320303574204445,
"adv/ratio_final_to_reasoning": 0.9751095922596023,
"adv/ratio_step_to_reasoning": 1.0020686698297456,
"adv/std_final_conf": 0.16110378503799438,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.165557861328125,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.953125,
"calib/ece": 0.495,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.495,
"calib/mu_c": NaN,
"calib/mu_w": 0.495,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.9140625,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.495,
"calib/std_conf": 0.495,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_w": 0.540677868852459,
"calib/step_q_w_n": 244.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1183.0,
"completions/max_terminated_length": 1183.0,
"completions/mean_length": 247.47265625,
"completions/mean_terminated_length": 247.47265625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.1312,
"grad_norm": 0.01294808741658926,
"kl": 0.2294769287109375,
"learning_rate": 2.138888888888889e-06,
"loss": -0.0594,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00017276535800192505,
"mask/share_reasoning": 0.932379424571991,
"mask/share_step_conf": 0.06744778901338577,
"num_tokens": 26045505.0,
"reward": -0.0010277825640514493,
"reward_std": 0.0029070081654936075,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 7.773437391733751e-05,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0029145495500415564,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.8984375,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.8984375,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_w": 0.5718039130434783,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 995.0,
"completions/max_terminated_length": 995.0,
"completions/mean_length": 210.28125,
"completions/mean_terminated_length": 210.28125,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.001803459133952856,
"kl": 0.2413482666015625,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.0208,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9198349118232727,
"mask/share_step_conf": 0.08016512542963028,
"num_tokens": 26206153.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.01880052126944065,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.01928030140697956,
"adv/ratio_final_to_reasoning": 0.32503653075320077,
"adv/ratio_step_to_reasoning": 0.33333130456266014,
"adv/std_final_conf": 0.16110378503799438,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.16521507501602173,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.875,
"calib/ece": 0.3225,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.3550000000000001,
"calib/mean_conf": 0.7675,
"calib/mu_c": 0.9450000000000001,
"calib/mu_w": 0.59,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.8828125,
"calib/nonempty_step_conf_rate": 0.87109375,
"calib/pce": 0.295,
"calib/std_conf": 0.3361826140656295,
"calib/step_conf_rate": 0.87109375,
"calib/step_q_w": 0.5149190476190476,
"calib/step_q_w_n": 224.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1395.0,
"completions/max_terminated_length": 1395.0,
"completions/mean_length": 230.515625,
"completions/mean_terminated_length": 231.41961669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.04646065831184387,
"kl": 0.235107421875,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.0719,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0006107184453867376,
"mask/share_reasoning": 0.9152103662490845,
"mask/share_step_conf": 0.08027268201112747,
"num_tokens": 26369973.0,
"reward": 0.0008209494408220053,
"reward_std": 0.002321995561942458,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 7.773437391733751e-05,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0007795855053700507,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.890625,
"calib/ece": 0.7333333333333334,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": -0.30000000000000004,
"calib/mean_conf": 0.7999999999999999,
"calib/mu_c": 0.6,
"calib/mu_w": 0.9,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.8984375,
"calib/nonempty_step_conf_rate": 0.8828125,
"calib/pce": 0.6,
"calib/std_conf": 0.16329931618554522,
"calib/step_conf_rate": 0.8828125,
"calib/step_q_w": 0.5426749999999999,
"calib/step_q_w_n": 228.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1117.0,
"completions/max_terminated_length": 1117.0,
"completions/mean_length": 184.12109375,
"completions/mean_terminated_length": 184.12109375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.1344,
"grad_norm": 0.004394296556711197,
"kl": 0.2568359375,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.005,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0003749749739654362,
"mask/share_reasoning": 0.9051406383514404,
"mask/share_step_conf": 0.09448444843292236,
"num_tokens": 26522572.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.921875,
"calib/ece": 0.45999999999999996,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.05999999999999994,
"calib/mean_conf": 0.96,
"calib/mu_c": 0.99,
"calib/mu_w": 0.93,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.45999999999999996,
"calib/std_conf": 0.02999999999999997,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_w": 0.5421467514124293,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1589.0,
"completions/max_terminated_length": 1589.0,
"completions/mean_length": 188.64453125,
"completions/mean_terminated_length": 188.64453125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.009678676724433899,
"kl": 0.27178955078125,
"learning_rate": 2.027777777777778e-06,
"loss": 0.0113,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00019561882072594017,
"mask/share_reasoning": 0.9013167023658752,
"mask/share_step_conf": 0.09848769754171371,
"num_tokens": 26674537.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.88671875,
"calib/ece": 0.845,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": -0.69,
"calib/mean_conf": 0.615,
"calib/mu_c": 0.27,
"calib/mu_w": 0.96,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.890625,
"calib/nonempty_step_conf_rate": 0.8828125,
"calib/pce": 0.48,
"calib/std_conf": 0.345,
"calib/step_conf_rate": 0.8828125,
"calib/step_q_w": 0.5847447870778268,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2550.0,
"completions/max_terminated_length": 2550.0,
"completions/mean_length": 255.796875,
"completions/mean_terminated_length": 255.796875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.003527594031766057,
"kl": 0.225738525390625,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.015,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.00018033437663689256,
"mask/share_reasoning": 0.9266326427459717,
"mask/share_step_conf": 0.07318704575300217,
"num_tokens": 26846685.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.921875,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.9140625,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_w": 0.5749310734463278,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1232.0,
"completions/max_terminated_length": 1232.0,
"completions/mean_length": 187.78515625,
"completions/mean_terminated_length": 187.78515625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.1376,
"grad_norm": 0.0018569625681266189,
"kl": 0.26055908203125,
"learning_rate": 1.9722222222222224e-06,
"loss": 0.0208,
"mask/has_final_conf_rate": 0.0,
"mask/share_final_conf": 0.0,
"mask/share_reasoning": 0.9001195430755615,
"mask/share_step_conf": 0.09988044202327728,
"num_tokens": 26997142.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.019323350861668587,
"adv/mean_abs_reasoning": 0.07714889943599701,
"adv/mean_abs_step_conf": 0.019258743152022362,
"adv/ratio_final_to_reasoning": 0.25046826335739636,
"adv/ratio_step_to_reasoning": 0.24963082160361186,
"adv/std_final_conf": 0.16558398306369781,
"adv/std_reasoning": 0.33054885268211365,
"adv/std_step_conf": 0.1650303304195404,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.95703125,
"calib/ece": 0.2425,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.7575000000000001,
"calib/mu_c": 0.7575000000000001,
"calib/mu_w": NaN,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.0,
"calib/std_conf": 0.2621426138574192,
"calib/step_conf_rate": 0.90625,
"calib/step_q_c": 0.55,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.0374379781420765,
"calib/step_q_w": 0.5125620218579235,
"calib/step_q_w_n": 244.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1036.0,
"completions/max_terminated_length": 1036.0,
"completions/mean_length": 198.6640625,
"completions/mean_terminated_length": 198.6640625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.029257914051413536,
"kl": 0.258148193359375,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0872,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0011932657798752189,
"mask/share_reasoning": 0.9098485708236694,
"mask/share_step_conf": 0.08895816653966904,
"num_tokens": 27153288.0,
"reward": 0.0035145406145602465,
"reward_std": 0.009940622374415398,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0036812499165534973,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.000558418920263648,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.038646847009658813,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.0379277728497982,
"adv/ratio_final_to_reasoning": 1.002230488442963,
"adv/ratio_step_to_reasoning": 0.9835827046720463,
"adv/std_final_conf": 0.23417198657989502,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.22983452677726746,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.96875,
"calib/ece": 0.53,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.25,
"calib/mean_conf": 0.53,
"calib/mu_c": NaN,
"calib/mu_w": 0.53,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.53,
"calib/std_conf": 0.3463379852109786,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_w": 0.5664596774193549,
"calib/step_q_w_n": 248.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2109.0,
"completions/max_terminated_length": 2109.0,
"completions/mean_length": 190.38671875,
"completions/mean_terminated_length": 190.38671875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.027975257486104965,
"kl": 0.25921630859375,
"learning_rate": 1.916666666666667e-06,
"loss": -0.1447,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0005259591853246093,
"mask/share_reasoning": 0.9087549448013306,
"mask/share_step_conf": 0.09071913361549377,
"num_tokens": 27308235.0,
"reward": 0.004286515526473522,
"reward_std": 0.012124096974730492,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.00742187537252903,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.00041134387720376253,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.057708740234375,
"adv/mean_abs_reasoning": 0.0578957125544548,
"adv/mean_abs_step_conf": 0.0578995943069458,
"adv/ratio_final_to_reasoning": 0.9967705325346166,
"adv/ratio_step_to_reasoning": 1.0000670473221545,
"adv/std_final_conf": 0.2855128347873688,
"adv/std_reasoning": 0.2864321172237396,
"adv/std_step_conf": 0.2864512503147125,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.94921875,
"calib/ece": 0.505,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": -0.010000000000000009,
"calib/mean_conf": 0.72,
"calib/mu_c": 0.715,
"calib/mu_w": 0.725,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.3625,
"calib/std_conf": 0.18069310999592653,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.7,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.16071535269709536,
"calib/step_q_w": 0.5392846473029046,
"calib/step_q_w_n": 241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1597.0,
"completions/max_terminated_length": 1597.0,
"completions/mean_length": 198.7109375,
"completions/mean_terminated_length": 198.7109375,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.1408,
"grad_norm": 0.05701402947306633,
"kl": 0.2595062255859375,
"learning_rate": 1.888888888888889e-06,
"loss": -0.239,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0011298644822090864,
"mask/share_reasoning": 0.8932276964187622,
"mask/share_step_conf": 0.1056424006819725,
"num_tokens": 27464697.0,
"reward": 0.003786170156672597,
"reward_std": 0.011870051734149456,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.007330859545618296,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.0036647694651037455,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.9375,
"calib/ece": 0.74,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.74,
"calib/mu_c": NaN,
"calib/mu_w": 0.74,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.74,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_w": 0.5189918055555557,
"calib/step_q_w_n": 240.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1224.0,
"completions/max_terminated_length": 1224.0,
"completions/mean_length": 179.5078125,
"completions/mean_terminated_length": 179.5078125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.002950991503894329,
"kl": 0.274261474609375,
"learning_rate": 1.8611111111111113e-06,
"loss": 0.021,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00012317004438955337,
"mask/share_reasoning": 0.9076688289642334,
"mask/share_step_conf": 0.09220802038908005,
"num_tokens": 27616995.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.038023941218853,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.038561079651117325,
"adv/ratio_final_to_reasoning": 0.98607664347814,
"adv/ratio_step_to_reasoning": 1.000006279528274,
"adv/std_final_conf": 0.23041187226772308,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.23365232348442078,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.890625,
"calib/ece": 0.97,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.97,
"calib/mu_c": NaN,
"calib/mu_w": 0.97,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.875,
"calib/nonempty_step_conf_rate": 0.875,
"calib/pce": 0.97,
"calib/std_conf": 0.020000000000000018,
"calib/step_conf_rate": 0.875,
"calib/step_q_w": 0.5715896198830409,
"calib/step_q_w_n": 228.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2362.0,
"completions/max_terminated_length": 2362.0,
"completions/mean_length": 226.83984375,
"completions/mean_terminated_length": 226.83984375,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.023352211341261864,
"kl": 0.225128173828125,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.0386,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0007759142317809165,
"mask/share_reasoning": 0.9204539060592651,
"mask/share_step_conf": 0.07877011597156525,
"num_tokens": 27784018.0,
"reward": 0.00019159464864060283,
"reward_std": 0.0005419114604592323,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0004585937422234565,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0016379044391214848,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.03847457841038704,
"adv/mean_abs_reasoning": 0.03858806565403938,
"adv/mean_abs_step_conf": 0.03789190202951431,
"adv/ratio_final_to_reasoning": 0.9970590066713939,
"adv/ratio_step_to_reasoning": 0.9819590950537269,
"adv/std_final_conf": 0.23313045501708984,
"adv/std_reasoning": 0.23381583392620087,
"adv/std_step_conf": 0.22963960468769073,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.98046875,
"calib/ece": 0.5566666666666666,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.014999999999999902,
"calib/mean_conf": 0.6900000000000001,
"calib/mu_c": 0.7,
"calib/mu_w": 0.685,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.45666666666666667,
"calib/std_conf": 0.2328089345364563,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.98,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.46262933333333334,
"calib/step_q_w": 0.5173706666666666,
"calib/step_q_w_n": 250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2731.0,
"completions/max_terminated_length": 2731.0,
"completions/mean_length": 213.55859375,
"completions/mean_terminated_length": 213.55859375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.144,
"grad_norm": 0.03867189958691597,
"kl": 0.24249267578125,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.137,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0007803092012181878,
"mask/share_reasoning": 0.9052779674530029,
"mask/share_step_conf": 0.09394178539514542,
"num_tokens": 27944569.0,
"reward": 0.002227193210273981,
"reward_std": 0.008019620552659035,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0037855468690395355,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0016749105416238308,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.057971805334091187,
"adv/mean_abs_reasoning": 0.07720336318016052,
"adv/mean_abs_step_conf": 0.05711377039551735,
"adv/ratio_final_to_reasoning": 0.75089740843037,
"adv/ratio_step_to_reasoning": 0.7397834503949987,
"adv/std_final_conf": 0.28680849075317383,
"adv/std_reasoning": 0.3307822048664093,
"adv/std_step_conf": 0.2825852930545807,
"calib/answer_extract_rate": 0.01953125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.96484375,
"calib/ece": 0.057999999999999996,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.6,
"calib/gap": 0.8525,
"calib/mean_conf": 0.7819999999999999,
"calib/mu_c": 0.9525,
"calib/mu_w": 0.1,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.020000000000000004,
"calib/std_conf": 0.34510288321021027,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.7666666666666666,
"calib/step_q_c_n": 3.0,
"calib/step_q_gap": 0.22086092896174858,
"calib/step_q_w": 0.545805737704918,
"calib/step_q_w_n": 244.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1040.0,
"completions/max_terminated_length": 1040.0,
"completions/mean_length": 171.26953125,
"completions/mean_terminated_length": 171.94119262695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.021164115518331528,
"kl": 0.27850341796875,
"learning_rate": 1.777777777777778e-06,
"loss": -0.1423,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.001021144213154912,
"mask/share_reasoning": 0.8896456956863403,
"mask/share_step_conf": 0.10542689263820648,
"num_tokens": 28096902.0,
"reward": 0.008188535459339619,
"reward_std": 0.023160677403211594,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.01162890624254942,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.0007205858128145337,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.038537055253982544,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.038625482469797134,
"adv/ratio_final_to_reasoning": 0.9993832537153554,
"adv/ratio_step_to_reasoning": 1.0016764408329322,
"adv/std_final_conf": 0.2335069626569748,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.23404252529144287,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 1.02734375,
"calib/ece": 0.6466666666666666,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/mean_conf": 0.6466666666666666,
"calib/mu_c": NaN,
"calib/mu_w": 0.6466666666666666,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.6466666666666666,
"calib/std_conf": 0.3531131389355101,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_w": 0.5505894803548796,
"calib/step_q_w_n": 263.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2118.0,
"completions/max_terminated_length": 2118.0,
"completions/mean_length": 186.296875,
"completions/mean_terminated_length": 186.296875,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.027069352567195892,
"kl": 0.2666015625,
"learning_rate": 1.75e-06,
"loss": -0.0801,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0010379692539572716,
"mask/share_reasoning": 0.8899767398834229,
"mask/share_step_conf": 0.10898531973361969,
"num_tokens": 28251578.0,
"reward": -0.0004488623235374689,
"reward_std": 0.001269574393518269,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0015386719023808837,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.003998896572738886,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.082949697971344,
"adv/mean_abs_reasoning": 0.08451591432094574,
"adv/mean_abs_step_conf": 0.07661990821361542,
"adv/ratio_final_to_reasoning": 0.9814683854255649,
"adv/ratio_step_to_reasoning": 0.9065737361919134,
"adv/std_final_conf": 0.3310111463069916,
"adv/std_reasoning": 0.3308088481426239,
"adv/std_step_conf": 0.32838961482048035,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.94140625,
"calib/ece": 0.24833333333333332,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.0675,
"calib/mean_conf": 0.9050000000000001,
"calib/mu_c": 0.9275,
"calib/mu_w": 0.86,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.24333333333333332,
"calib/std_conf": 0.059651767227244266,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.82,
"calib/step_q_c_n": 4.0,
"calib/step_q_gap": 0.24735808720112507,
"calib/step_q_w": 0.5726419127988749,
"calib/step_q_w_n": 237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1189.0,
"completions/max_terminated_length": 1189.0,
"completions/mean_length": 190.85546875,
"completions/mean_terminated_length": 191.6039276123047,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.1472,
"grad_norm": 0.043301813304424286,
"kl": 0.2565765380859375,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.2439,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0022418617736548185,
"mask/share_reasoning": 0.8968058824539185,
"mask/share_step_conf": 0.09704601764678955,
"num_tokens": 28404773.0,
"reward": 0.011221060529351234,
"reward_std": 0.02459767647087574,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0175175778567791,
"rewards/format_reward_step": 0.0234375,
"rewards/step_l1_reward": -0.0028879554010927677,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.01932401955127716,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.019282005727291107,
"adv/ratio_final_to_reasoning": 0.5011307015218485,
"adv/ratio_step_to_reasoning": 0.5000411550622272,
"adv/std_final_conf": 0.16558970510959625,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.1652296781539917,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 0.25,
"calib/avg_num_step_conf": 0.9609375,
"calib/ece": 0.6266666666666667,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.44,
"calib/mean_conf": 0.29333333333333333,
"calib/mu_c": 0.0,
"calib/mu_w": 0.44,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.29333333333333333,
"calib/std_conf": 0.41483597829610785,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_w": 0.5567165311653116,
"calib/step_q_w_n": 246.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1581.0,
"completions/max_terminated_length": 1581.0,
"completions/mean_length": 172.96484375,
"completions/mean_terminated_length": 172.96484375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.011977670714259148,
"kl": 0.289459228515625,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0227,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.000581751111894846,
"mask/share_reasoning": 0.8981517553329468,
"mask/share_step_conf": 0.10126648843288422,
"num_tokens": 28552148.0,
"reward": 0.002332001691684127,
"reward_std": 0.006595896556973457,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.00390625,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0008047465817071497,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.07728341221809387,
"adv/mean_abs_reasoning": 0.11568251252174377,
"adv/mean_abs_step_conf": 0.07716748863458633,
"adv/ratio_final_to_reasoning": 0.6680647794848649,
"adv/ratio_step_to_reasoning": 0.6670626955831537,
"adv/std_final_conf": 0.3311251401901245,
"adv/std_reasoning": 0.4046950340270996,
"adv/std_step_conf": 0.33062857389450073,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.8500000000000001,
"calib/avg_num_step_conf": 0.95703125,
"calib/ece": 0.33,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.42857142857142855,
"calib/gap": 0.5309999999999999,
"calib/mean_conf": 0.6157142857142858,
"calib/mu_c": 0.995,
"calib/mu_w": 0.4640000000000001,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.33,
"calib/std_conf": 0.3848880038533341,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_w": 0.5250217687074831,
"calib/step_q_w_n": 245.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1276.0,
"completions/max_terminated_length": 1276.0,
"completions/mean_length": 181.58984375,
"completions/mean_terminated_length": 181.58984375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.06448958069086075,
"kl": 0.28741455078125,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.3303,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0017336525488644838,
"mask/share_reasoning": 0.8995578289031982,
"mask/share_step_conf": 0.09870850294828415,
"num_tokens": 28703651.0,
"reward": 0.006518370937556028,
"reward_std": 0.0184367373585701,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.012849219143390656,
"rewards/format_reward_step": 0.015625,
"rewards/step_l1_reward": -0.004499976523220539,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.0772620216012001,
"adv/mean_abs_reasoning": 0.09642931818962097,
"adv/mean_abs_step_conf": 0.07724109292030334,
"adv/ratio_final_to_reasoning": 0.8012295746950131,
"adv/ratio_step_to_reasoning": 0.8010125382035219,
"adv/std_final_conf": 0.33103352785110474,
"adv/std_reasoning": 0.3695387542247772,
"adv/std_step_conf": 0.33094385266304016,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.45,
"calib/avg_num_step_conf": 0.99609375,
"calib/ece": 0.6028571428571428,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": -0.16700000000000004,
"calib/mean_conf": 0.6142857142857144,
"calib/mu_c": 0.495,
"calib/mu_w": 0.662,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.46571428571428575,
"calib/std_conf": 0.38858193263113366,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.7,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.12838582677165356,
"calib/step_q_w": 0.5716141732283464,
"calib/step_q_w_n": 254.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2185.0,
"completions/max_terminated_length": 2185.0,
"completions/mean_length": 215.9375,
"completions/mean_terminated_length": 215.9375,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.1504,
"grad_norm": 0.036474522203207016,
"kl": 0.249755859375,
"learning_rate": 1.638888888888889e-06,
"loss": -0.2731,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0012745312415063381,
"mask/share_reasoning": 0.9076339602470398,
"mask/share_step_conf": 0.09109152853488922,
"num_tokens": 28866027.0,
"reward": 0.0038409747648984194,
"reward_std": 0.013156676664948463,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.010916406288743019,
"rewards/format_reward_step": 0.015625,
"rewards/step_l1_reward": -0.007921956479549408,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.06394700706005096,
"adv/mean_abs_reasoning": 0.08143529295921326,
"adv/mean_abs_step_conf": 0.05882935971021652,
"adv/ratio_final_to_reasoning": 0.7852493032975116,
"adv/ratio_step_to_reasoning": 0.7224061899019768,
"adv/std_final_conf": 0.28679755330085754,
"adv/std_reasoning": 0.33055466413497925,
"adv/std_step_conf": 0.2864469587802887,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 1.01953125,
"calib/ece": 0.16166666666666665,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.16666666666666666,
"calib/gap": 0.73,
"calib/mean_conf": 0.4583333333333333,
"calib/mu_c": 0.9450000000000001,
"calib/mu_w": 0.21500000000000002,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.91015625,
"calib/nonempty_step_conf_rate": 0.90234375,
"calib/pce": 0.14333333333333334,
"calib/std_conf": 0.3875743656240553,
"calib/step_conf_rate": 0.90234375,
"calib/step_q_c": 0.8,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.23791679487179485,
"calib/step_q_w": 0.5620832051282052,
"calib/step_q_w_n": 260.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2426.0,
"completions/max_terminated_length": 2426.0,
"completions/mean_length": 196.19921875,
"completions/mean_terminated_length": 197.7440948486328,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.05591246113181114,
"kl": 0.268585205078125,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.2564,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0022723714355379343,
"mask/share_reasoning": 0.8812129497528076,
"mask/share_step_conf": 0.10870218276977539,
"num_tokens": 29021414.0,
"reward": 0.007314523681998253,
"reward_std": 0.01666373386979103,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.01413437444716692,
"rewards/format_reward_step": 0.015625,
"rewards/step_l1_reward": -0.004192827269434929,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.05766920745372772,
"adv/mean_abs_reasoning": 0.07712167501449585,
"adv/mean_abs_step_conf": 0.05647395923733711,
"adv/ratio_final_to_reasoning": 0.7477691251245279,
"adv/ratio_step_to_reasoning": 0.7322709112155853,
"adv/std_final_conf": 0.28531643748283386,
"adv/std_reasoning": 0.33043211698532104,
"adv/std_step_conf": 0.279586523771286,
"calib/answer_extract_rate": 0.01953125,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.9296875,
"calib/ece": 0.542,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.2,
"calib/gap": 0.1975,
"calib/mean_conf": 0.742,
"calib/mu_c": 0.9,
"calib/mu_w": 0.7025,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.87109375,
"calib/nonempty_step_conf_rate": 0.86328125,
"calib/pce": 0.542,
"calib/std_conf": 0.2530138336138955,
"calib/step_conf_rate": 0.86328125,
"calib/step_q_w": 0.5040534313725491,
"calib/step_q_w_n": 238.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2342.0,
"completions/max_terminated_length": 2342.0,
"completions/mean_length": 191.19140625,
"completions/mean_terminated_length": 191.19140625,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.05209788307547569,
"kl": 0.2831268310546875,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.2249,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0014132431242614985,
"mask/share_reasoning": 0.89207923412323,
"mask/share_step_conf": 0.10650746524333954,
"num_tokens": 29177695.0,
"reward": 0.0009553575655445457,
"reward_std": 0.007277060765773058,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0031640625093132257,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.0043783471919596195,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.07675210386514664,
"adv/mean_abs_reasoning": 0.07714889943599701,
"adv/mean_abs_step_conf": 0.07728776335716248,
"adv/ratio_final_to_reasoning": 0.9948567565609986,
"adv/ratio_step_to_reasoning": 1.0017999468842802,
"adv/std_final_conf": 0.32887107133865356,
"adv/std_reasoning": 0.33054885268211365,
"adv/std_step_conf": 0.3311437666416168,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.91796875,
"calib/ece": 0.665,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.16666666666666666,
"calib/gap": -0.26200000000000007,
"calib/mean_conf": 0.6383333333333333,
"calib/mu_c": 0.42,
"calib/mu_w": 0.682,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.5683333333333334,
"calib/std_conf": 0.18968541207893544,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.0,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": -0.5767234620886981,
"calib/step_q_w": 0.5767234620886981,
"calib/step_q_w_n": 233.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2758.0,
"completions/max_terminated_length": 2758.0,
"completions/mean_length": 186.35546875,
"completions/mean_terminated_length": 186.35546875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.1536,
"grad_norm": 0.04337361454963684,
"kl": 0.285064697265625,
"learning_rate": 1.5555555555555558e-06,
"loss": -0.2841,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0018886120524257421,
"mask/share_reasoning": 0.8982242345809937,
"mask/share_step_conf": 0.09988721460103989,
"num_tokens": 29329530.0,
"reward": -0.0010638143867254257,
"reward_std": 0.00474740844219923,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.007497265934944153,
"rewards/format_reward_step": 0.015625,
"rewards/step_l1_reward": -0.013531144708395004,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.05796315148472786,
"adv/mean_abs_reasoning": 0.057868484407663345,
"adv/mean_abs_step_conf": 0.05754847824573517,
"adv/ratio_final_to_reasoning": 1.0016359004047457,
"adv/ratio_step_to_reasoning": 0.9944701133059949,
"adv/std_final_conf": 0.2867657244205475,
"adv/std_reasoning": 0.28629741072654724,
"adv/std_step_conf": 0.28471487760543823,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.6666666666666667,
"calib/avg_num_step_conf": 0.98046875,
"calib/ece": 0.49,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.1466666666666666,
"calib/mean_conf": 0.59,
"calib/mu_c": 0.7,
"calib/mu_w": 0.5533333333333333,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.41500000000000004,
"calib/std_conf": 0.26767517628648346,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.65,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.171516,
"calib/step_q_w": 0.478484,
"calib/step_q_w_n": 250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 808.0,
"completions/max_terminated_length": 808.0,
"completions/mean_length": 139.59375,
"completions/mean_terminated_length": 139.59375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.05382170528173447,
"kl": 0.33148193359375,
"learning_rate": 1.527777777777778e-06,
"loss": -0.1813,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.002252190839499235,
"mask/share_reasoning": 0.8799644112586975,
"mask/share_step_conf": 0.11778340488672256,
"num_tokens": 29467970.0,
"reward": 0.0059430343098938465,
"reward_std": 0.016809439286589622,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.009679296985268593,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.0009182285284623504,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.09656653553247452,
"adv/mean_abs_reasoning": 0.09645655006170273,
"adv/mean_abs_step_conf": 0.09653792530298233,
"adv/ratio_final_to_reasoning": 1.0011402592224317,
"adv/ratio_step_to_reasoning": 1.0008436466080068,
"adv/std_final_conf": 0.3700646162033081,
"adv/std_reasoning": 0.36964312195777893,
"adv/std_step_conf": 0.36995500326156616,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.55,
"calib/avg_num_step_conf": 0.91796875,
"calib/ece": 0.36714285714285705,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.026999999999999913,
"calib/mean_conf": 0.5157142857142858,
"calib/mu_c": 0.5349999999999999,
"calib/mu_w": 0.508,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.91015625,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.29857142857142854,
"calib/std_conf": 0.3107068964638604,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_c": 0.386,
"calib/step_q_c_n": 5.0,
"calib/step_q_gap": -0.15236942028985512,
"calib/step_q_w": 0.5383694202898551,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1154.0,
"completions/max_terminated_length": 1154.0,
"completions/mean_length": 196.71484375,
"completions/mean_terminated_length": 196.71484375,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.06925830245018005,
"kl": 0.254302978515625,
"learning_rate": 1.5e-06,
"loss": -0.3652,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0016984788235276937,
"mask/share_reasoning": 0.9018841981887817,
"mask/share_step_conf": 0.09641735255718231,
"num_tokens": 29625545.0,
"reward": 0.003927886951714754,
"reward_std": 0.015175402164459229,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.012025780975818634,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l1_reward": -0.009638756513595581,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.10098693519830704,
"adv/mean_abs_reasoning": 0.12002336233854294,
"adv/mean_abs_step_conf": 0.10241132974624634,
"adv/ratio_final_to_reasoning": 0.8413939855597367,
"adv/ratio_step_to_reasoning": 0.8532616296599044,
"adv/std_final_conf": 0.3682190775871277,
"adv/std_reasoning": 0.40489038825035095,
"adv/std_step_conf": 0.36968347430229187,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.7,
"calib/avg_num_step_conf": 0.9609375,
"calib/ece": 0.43125,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.375,
"calib/gap": 0.19933333333333336,
"calib/mean_conf": 0.60875,
"calib/mu_c": 0.7333333333333334,
"calib/mu_w": 0.534,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.3325,
"calib/std_conf": 0.3560701300305882,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.635,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.07624316939890718,
"calib/step_q_w": 0.5587568306010928,
"calib/step_q_w_n": 244.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 194.4140625,
"completions/mean_terminated_length": 195.17648315429688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1568,
"grad_norm": 0.04261239245533943,
"kl": 0.2639923095703125,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.4112,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0023956787772476673,
"mask/share_reasoning": 0.890393853187561,
"mask/share_step_conf": 0.10330420732498169,
"num_tokens": 29778995.0,
"reward": 0.005495469085872173,
"reward_std": 0.021126050502061844,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.01336992159485817,
"rewards/format_reward_step": 0.0234375,
"rewards/step_l1_reward": -0.009410234168171883,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.019322939217090607,
"adv/mean_abs_reasoning": 0.03858806565403938,
"adv/mean_abs_step_conf": 0.019289027899503708,
"adv/ratio_final_to_reasoning": 0.500749101816351,
"adv/ratio_step_to_reasoning": 0.499870298564306,
"adv/std_final_conf": 0.16558043658733368,
"adv/std_reasoning": 0.23381584882736206,
"adv/std_step_conf": 0.16528984904289246,
"calib/answer_extract_rate": 0.0078125,
"calib/auroc": 0.25,
"calib/avg_num_step_conf": 0.984375,
"calib/ece": 0.3666666666666666,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.19999999999999996,
"calib/mean_conf": 0.5666666666666667,
"calib/mu_c": 0.5,
"calib/mu_w": 0.7,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.1333333333333333,
"calib/std_conf": 0.18856180831641264,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_c": 0.8,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.2550377158034529,
"calib/step_q_w": 0.5449622841965471,
"calib/step_q_w_n": 251.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 962.0,
"completions/max_terminated_length": 962.0,
"completions/mean_length": 177.83203125,
"completions/mean_terminated_length": 177.83203125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.017495930194854736,
"kl": 0.280548095703125,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0763,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.000960962031967938,
"mask/share_reasoning": 0.8899896144866943,
"mask/share_step_conf": 0.10904946178197861,
"num_tokens": 29929632.0,
"reward": 0.002485139761120081,
"reward_std": 0.007029036059975624,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.003554687602445483,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0009281584643758833,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.09658879786729813,
"adv/mean_abs_reasoning": 0.11573696881532669,
"adv/mean_abs_step_conf": 0.09627419710159302,
"adv/ratio_final_to_reasoning": 0.8345544112306764,
"adv/ratio_step_to_reasoning": 0.831836172029103,
"adv/std_final_conf": 0.3701499104499817,
"adv/std_reasoning": 0.4048856496810913,
"adv/std_step_conf": 0.3689470887184143,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.96875,
"calib/ece": 0.38499999999999995,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.01953125,
"calib/frac_conf_gt_0.9": 0.375,
"calib/gap": 0.376,
"calib/mean_conf": 0.745,
"calib/mu_c": 0.98,
"calib/mu_w": 0.604,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.37749999999999995,
"calib/std_conf": 0.27013885318480196,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.865,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.3331924119241193,
"calib/step_q_w": 0.5318075880758807,
"calib/step_q_w_n": 246.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2090.0,
"completions/max_terminated_length": 2090.0,
"completions/mean_length": 193.4765625,
"completions/mean_terminated_length": 194.2353057861328,
"completions/min_length": 0.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.05413132533431053,
"kl": 0.2761993408203125,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.323,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0027247373946011066,
"mask/share_reasoning": 0.8881813287734985,
"mask/share_step_conf": 0.10518766939640045,
"num_tokens": 30083618.0,
"reward": 0.008374178782105446,
"reward_std": 0.0236857570707798,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.01495507825165987,
"rewards/format_reward_step": 0.01953125,
"rewards/step_l1_reward": -0.004456719383597374,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.03864767402410507,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.03861791640520096,
"adv/ratio_final_to_reasoning": 0.6681679569648148,
"adv/ratio_step_to_reasoning": 0.6676534864842654,
"adv/std_final_conf": 0.23417697846889496,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.2339966893196106,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.9765625,
"calib/ece": 0.38666666666666666,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/mean_conf": 0.38666666666666666,
"calib/mu_c": NaN,
"calib/mu_w": 0.38666666666666666,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.38666666666666666,
"calib/std_conf": 0.4259368758656876,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_w": 0.5733213333333333,
"calib/step_q_w_n": 250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2010.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 195.73828125,
"completions/mean_terminated_length": 195.73828125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.16,
"grad_norm": 0.03214937448501587,
"kl": 0.2613067626953125,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.1478,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0011974748922511935,
"mask/share_reasoning": 0.894477128982544,
"mask/share_step_conf": 0.10432544350624084,
"num_tokens": 30238687.0,
"reward": 0.0031226295977830887,
"reward_std": 0.008832130581140518,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.007685937453061342,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.003784427884966135,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.03864070400595665,
"adv/mean_abs_reasoning": 0.03861529380083084,
"adv/mean_abs_step_conf": 0.038446031510829926,
"adv/ratio_final_to_reasoning": 1.0006580347480163,
"adv/ratio_step_to_reasoning": 0.9956167032970425,
"adv/std_final_conf": 0.2341347485780716,
"adv/std_reasoning": 0.23398077487945557,
"adv/std_step_conf": 0.2329564392566681,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.6666666666666667,
"calib/avg_num_step_conf": 0.90234375,
"calib/ece": 0.422,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.2,
"calib/gap": 0.15000000000000008,
"calib/mean_conf": 0.4699999999999999,
"calib/mu_c": 0.56,
"calib/mu_w": 0.41,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.246,
"calib/std_conf": 0.35162480003549235,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_c": 0.835,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.3039599708879185,
"calib/step_q_w": 0.5310400291120815,
"calib/step_q_w_n": 229.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1884.0,
"completions/max_terminated_length": 1884.0,
"completions/mean_length": 212.734375,
"completions/mean_terminated_length": 212.734375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.018881434574723244,
"kl": 0.2248992919921875,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.054,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0012571100378409028,
"mask/share_reasoning": 0.9030216932296753,
"mask/share_step_conf": 0.09572114050388336,
"num_tokens": 30400171.0,
"reward": 0.004041813313961029,
"reward_std": 0.011431975290179253,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.006074219010770321,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0011155917309224606,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.05794394388794899,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.057304129004478455,
"adv/ratio_final_to_reasoning": 1.001775335352572,
"adv/ratio_step_to_reasoning": 0.9907137691828318,
"adv/std_final_conf": 0.2866707146167755,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.2835402190685272,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.9765625,
"calib/ece": 0.5714285714285714,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.14285714285714285,
"calib/mean_conf": 0.5714285714285714,
"calib/mu_c": NaN,
"calib/mu_w": 0.5714285714285714,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.5714285714285714,
"calib/std_conf": 0.26264743187282025,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_w": 0.47026093333333335,
"calib/step_q_w_n": 250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1148.0,
"completions/max_terminated_length": 1148.0,
"completions/mean_length": 183.16015625,
"completions/mean_terminated_length": 183.87844848632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.03377218544483185,
"kl": 0.319915771484375,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.1464,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0020097133237868547,
"mask/share_reasoning": 0.887736976146698,
"mask/share_step_conf": 0.10634706169366837,
"num_tokens": 30552452.0,
"reward": 0.0029946179129183292,
"reward_std": 0.010179774835705757,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.00791757833212614,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.004272093065083027,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.01932401955127716,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.01273763831704855,
"adv/ratio_final_to_reasoning": 0.5011307015218485,
"adv/ratio_step_to_reasoning": 0.33032576936780417,
"adv/std_final_conf": 0.16558970510959625,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.10915025323629379,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.9765625,
"calib/ece": 0.3233333333333333,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.465,
"calib/mean_conf": 0.59,
"calib/mu_c": 0.9,
"calib/mu_w": 0.435,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.29,
"calib/std_conf": 0.4173727350941841,
"calib/step_conf_rate": 0.90625,
"calib/step_q_w": 0.5264308,
"calib/step_q_w_n": 250.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2626.0,
"completions/max_terminated_length": 2626.0,
"completions/mean_length": 198.97265625,
"completions/mean_terminated_length": 198.97265625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.1632,
"grad_norm": 0.005113726481795311,
"kl": 0.2578277587890625,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0499,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0007983995601534843,
"mask/share_reasoning": 0.9040226936340332,
"mask/share_step_conf": 0.09517890214920044,
"num_tokens": 30710709.0,
"reward": 0.002732241991907358,
"reward_std": 0.00772794708609581,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.00390625,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -4.266354608262191e-06,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.11585914343595505,
"adv/mean_abs_reasoning": 0.13501739501953125,
"adv/mean_abs_step_conf": 0.1158389151096344,
"adv/ratio_final_to_reasoning": 0.8581053087211109,
"adv/ratio_step_to_reasoning": 0.8579554885715093,
"adv/std_final_conf": 0.4053131937980652,
"adv/std_reasoning": 0.4372970759868622,
"adv/std_step_conf": 0.40524232387542725,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.7,
"calib/avg_num_step_conf": 0.9296875,
"calib/ece": 0.44999999999999996,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.125,
"calib/gap": 0.11599999999999999,
"calib/mean_conf": 0.6675,
"calib/mu_c": 0.7399999999999999,
"calib/mu_w": 0.6239999999999999,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.37124999999999997,
"calib/std_conf": 0.2963001012487171,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.5,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": -0.022537288135593192,
"calib/step_q_w": 0.5225372881355932,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 202.93359375,
"completions/mean_terminated_length": 202.93359375,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.059825729578733444,
"kl": 0.283966064453125,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.4398,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0027232125867158175,
"mask/share_reasoning": 0.8968022465705872,
"mask/share_step_conf": 0.1004745364189148,
"num_tokens": 30867100.0,
"reward": 0.005700921639800072,
"reward_std": 0.021998731419444084,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.015060937032103539,
"rewards/format_reward_step": 0.0234375,
"rewards/step_l1_reward": -0.010690344497561455,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.025226594880223274,
"adv/mean_abs_reasoning": 0.044541243463754654,
"adv/mean_abs_step_conf": 0.02512870728969574,
"adv/ratio_final_to_reasoning": 0.5663648546487339,
"adv/ratio_step_to_reasoning": 0.564167170369731,
"adv/std_final_conf": 0.16532482206821442,
"adv/std_reasoning": 0.23372870683670044,
"adv/std_step_conf": 0.1656012386083603,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.9296875,
"calib/ece": 0.7775,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.41,
"calib/mean_conf": 0.7475,
"calib/mu_c": 0.44,
"calib/mu_w": 0.85,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.9140625,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.6375,
"calib/std_conf": 0.18592673288153053,
"calib/step_conf_rate": 0.90625,
"calib/step_q_w": 0.5171634453781513,
"calib/step_q_w_n": 238.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 742.0,
"completions/max_terminated_length": 742.0,
"completions/mean_length": 151.33984375,
"completions/mean_terminated_length": 151.933349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.01697084680199623,
"kl": 0.331878662109375,
"learning_rate": 1.25e-06,
"loss": -0.072,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0013132116291671991,
"mask/share_reasoning": 0.8803766369819641,
"mask/share_step_conf": 0.1144038736820221,
"num_tokens": 31013059.0,
"reward": -0.0014306087978184223,
"reward_std": 0.004541726782917976,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0015542968176305294,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0067592645063996315,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.019319972023367882,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.019240034744143486,
"adv/ratio_final_to_reasoning": 1.0020514735830872,
"adv/ratio_step_to_reasoning": 0.9979054391921397,
"adv/std_final_conf": 0.16555501520633698,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.164870023727417,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.9453125,
"calib/ece": 0.5366666666666666,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/mean_conf": 0.5366666666666666,
"calib/mu_c": NaN,
"calib/mu_w": 0.5366666666666666,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.5366666666666666,
"calib/std_conf": 0.3472111109333276,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_w": 0.5298030303030303,
"calib/step_q_w_n": 242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1701.0,
"completions/max_terminated_length": 1701.0,
"completions/mean_length": 170.19921875,
"completions/mean_terminated_length": 170.19921875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.1664,
"grad_norm": 0.023405877873301506,
"kl": 0.296722412109375,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0548,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0011736187152564526,
"mask/share_reasoning": 0.8808482885360718,
"mask/share_step_conf": 0.11797812581062317,
"num_tokens": 31161390.0,
"reward": 0.00159166450612247,
"reward_std": 0.004501907154917717,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.002850000048056245,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0004479209310375154,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.03864702582359314,
"adv/mean_abs_reasoning": 0.08310207724571228,
"adv/mean_abs_step_conf": 0.038230299949645996,
"adv/ratio_final_to_reasoning": 0.46505487112341903,
"adv/ratio_step_to_reasoning": 0.46004024468135085,
"adv/std_final_conf": 0.23417307436466217,
"adv/std_reasoning": 0.3304872214794159,
"adv/std_step_conf": 0.23165775835514069,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.6666666666666666,
"calib/avg_num_step_conf": 0.90625,
"calib/ece": 0.398,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.17666666666666664,
"calib/mean_conf": 0.306,
"calib/mu_c": 0.37666666666666665,
"calib/mu_w": 0.2,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.90234375,
"calib/nonempty_step_conf_rate": 0.88671875,
"calib/pce": 0.052000000000000005,
"calib/std_conf": 0.17884071124886525,
"calib/step_conf_rate": 0.88671875,
"calib/step_q_w": 0.546862643678161,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1117.0,
"completions/max_terminated_length": 1117.0,
"completions/mean_length": 165.22265625,
"completions/mean_terminated_length": 165.22265625,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.02955986186861992,
"kl": 0.29241943359375,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.1381,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0010385285131633282,
"mask/share_reasoning": 0.880927324295044,
"mask/share_step_conf": 0.11803416907787323,
"num_tokens": 31307415.0,
"reward": 0.005156665109097958,
"reward_std": 0.01382213644683361,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.007471875287592411,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0010647946037352085,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.05792781710624695,
"adv/mean_abs_reasoning": 0.07714889943599701,
"adv/mean_abs_step_conf": 0.057842917740345,
"adv/ratio_final_to_reasoning": 0.750857336005215,
"adv/ratio_step_to_reasoning": 0.7497568748641927,
"adv/std_final_conf": 0.28659093379974365,
"adv/std_reasoning": 0.33054885268211365,
"adv/std_step_conf": 0.28617167472839355,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.9765625,
"calib/ece": 0.5442857142857144,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.14285714285714285,
"calib/gap": -0.03500000000000003,
"calib/mean_conf": 0.5700000000000001,
"calib/mu_c": 0.545,
"calib/mu_w": 0.5800000000000001,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.41428571428571426,
"calib/std_conf": 0.2770250117640231,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.58,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.04874899598393567,
"calib/step_q_w": 0.5312510040160643,
"calib/step_q_w_n": 249.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1476.0,
"completions/max_terminated_length": 1476.0,
"completions/mean_length": 171.08984375,
"completions/mean_terminated_length": 171.08984375,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.044954050332307816,
"kl": 0.348052978515625,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.1857,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0017155336681753397,
"mask/share_reasoning": 0.885796070098877,
"mask/share_step_conf": 0.11248837411403656,
"num_tokens": 31456454.0,
"reward": 0.0025267673190683126,
"reward_std": 0.009554600343108177,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.005995702929794788,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.004848418291658163,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.038645416498184204,
"adv/mean_abs_reasoning": 0.03858806565403938,
"adv/mean_abs_step_conf": 0.03852042183279991,
"adv/ratio_final_to_reasoning": 1.0014862326777143,
"adv/ratio_step_to_reasoning": 0.9982470274139696,
"adv/std_final_conf": 0.234163299202919,
"adv/std_reasoning": 0.23381583392620087,
"adv/std_step_conf": 0.23340687155723572,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.96484375,
"calib/ece": 0.35,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.25500000000000006,
"calib/mean_conf": 0.38999999999999996,
"calib/mu_c": 0.56,
"calib/mu_w": 0.305,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.2033333333333333,
"calib/std_conf": 0.26919633479426625,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 1.0,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.44775632653061226,
"calib/step_q_w": 0.5522436734693877,
"calib/step_q_w_n": 245.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1068.0,
"completions/max_terminated_length": 1068.0,
"completions/mean_length": 167.48828125,
"completions/mean_terminated_length": 167.48828125,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.1696,
"grad_norm": 0.029850637540221214,
"kl": 0.308319091796875,
"learning_rate": 1.138888888888889e-06,
"loss": -0.13,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0015966113423928618,
"mask/share_reasoning": 0.8897839784622192,
"mask/share_step_conf": 0.10861947387456894,
"num_tokens": 31604115.0,
"reward": 0.0034665153361856937,
"reward_std": 0.009804786182940006,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0070558590814471245,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0024665785022079945,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.05790805071592331,
"adv/mean_abs_reasoning": 0.09645655006170273,
"adv/mean_abs_step_conf": 0.05781654268503189,
"adv/ratio_final_to_reasoning": 0.6003537414398488,
"adv/ratio_step_to_reasoning": 0.5994050445308998,
"adv/std_final_conf": 0.28649330139160156,
"adv/std_reasoning": 0.36964312195777893,
"adv/std_step_conf": 0.28604114055633545,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.95703125,
"calib/ece": 0.5,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.16666666666666666,
"calib/gap": -0.015000000000000013,
"calib/mean_conf": 0.52,
"calib/mu_c": 0.515,
"calib/mu_w": 0.53,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.17666666666666667,
"calib/std_conf": 0.26695817400234567,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.645,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.14981111111111106,
"calib/step_q_w": 0.49518888888888896,
"calib/step_q_w_n": 243.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1638.0,
"completions/max_terminated_length": 1638.0,
"completions/mean_length": 178.32421875,
"completions/mean_terminated_length": 178.32421875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.03517092391848564,
"kl": 0.298797607421875,
"learning_rate": 1.111111111111111e-06,
"loss": -0.2384,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0011997153051197529,
"mask/share_reasoning": 0.8991638422012329,
"mask/share_step_conf": 0.09963646531105042,
"num_tokens": 31754606.0,
"reward": 0.003261414123699069,
"reward_std": 0.01592809334397316,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0059742191806435585,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.004920140374451876,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.0193235632032156,
"adv/mean_abs_reasoning": 0.06382165849208832,
"adv/mean_abs_step_conf": 0.019290367141366005,
"adv/ratio_final_to_reasoning": 0.3027743819225734,
"adv/ratio_step_to_reasoning": 0.3022542440472202,
"adv/std_final_conf": 0.16558578610420227,
"adv/std_reasoning": 0.2862262427806854,
"adv/std_step_conf": 0.16530132293701172,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.9296875,
"calib/ece": 0.20666666666666667,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.39000000000000007,
"calib/mean_conf": 0.46,
"calib/mu_c": 0.5900000000000001,
"calib/mu_w": 0.2,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.0,
"calib/std_conf": 0.31283648551066845,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_w": 0.5152964985994398,
"calib/step_q_w_n": 238.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1412.0,
"completions/max_terminated_length": 1412.0,
"completions/mean_length": 173.3828125,
"completions/mean_terminated_length": 173.3828125,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.020821698009967804,
"kl": 0.2965240478515625,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0847,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.00027795127243734896,
"mask/share_reasoning": 0.8859891891479492,
"mask/share_step_conf": 0.11373290419578552,
"num_tokens": 31902912.0,
"reward": 0.0029594521038234234,
"reward_std": 0.007607479579746723,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.0037499999161809683,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.000956095929723233,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.038639381527900696,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.038621000945568085,
"adv/ratio_final_to_reasoning": 0.6680245905039106,
"adv/ratio_step_to_reasoning": 0.6677068141705317,
"adv/std_final_conf": 0.2341267615556717,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.23401540517807007,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.8,
"calib/avg_num_step_conf": 0.95703125,
"calib/ece": 0.30833333333333335,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.238,
"calib/mean_conf": 0.3116666666666667,
"calib/mu_c": 0.51,
"calib/mu_w": 0.272,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.22666666666666668,
"calib/std_conf": 0.21145658866275338,
"calib/step_conf_rate": 0.9375,
"calib/step_q_w": 0.560511700680272,
"calib/step_q_w_n": 245.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 914.0,
"completions/max_terminated_length": 914.0,
"completions/mean_length": 153.421875,
"completions/mean_terminated_length": 154.02354431152344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.1728,
"grad_norm": 0.0327790267765522,
"kl": 0.328704833984375,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.1476,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.001432921038940549,
"mask/share_reasoning": 0.8785004019737244,
"mask/share_step_conf": 0.11616045236587524,
"num_tokens": 32046332.0,
"reward": 0.001956491032615304,
"reward_std": 0.0058908602222800255,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.005937109235674143,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.00436787772923708,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.057446904480457306,
"adv/mean_abs_reasoning": 0.1024097204208374,
"adv/mean_abs_step_conf": 0.05789678916335106,
"adv/ratio_final_to_reasoning": 0.5609516776765707,
"adv/ratio_step_to_reasoning": 0.5653446657742339,
"adv/std_final_conf": 0.2842351794242859,
"adv/std_reasoning": 0.3695880174636841,
"adv/std_step_conf": 0.2864377796649933,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.953125,
"calib/ece": 0.55975,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": -0.11949999999999994,
"calib/mean_conf": 0.45974999999999994,
"calib/mu_c": 0.4,
"calib/mu_w": 0.5195,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.25975,
"calib/std_conf": 0.3782263707093941,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.08,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": -0.44900109739369004,
"calib/step_q_w": 0.5290010973936901,
"calib/step_q_w_n": 243.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 997.0,
"completions/max_terminated_length": 997.0,
"completions/mean_length": 155.38671875,
"completions/mean_terminated_length": 155.38671875,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.06696376949548721,
"kl": 0.336578369140625,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.2386,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0017463011899963021,
"mask/share_reasoning": 0.8830251693725586,
"mask/share_step_conf": 0.11522849649190903,
"num_tokens": 32190943.0,
"reward": 0.00286776851862669,
"reward_std": 0.007197332568466663,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.007374605629593134,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.007107818499207497,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.03864771127700806,
"adv/mean_abs_reasoning": 0.04287446290254593,
"adv/mean_abs_step_conf": 0.03591509163379669,
"adv/ratio_final_to_reasoning": 0.9014156367358975,
"adv/ratio_step_to_reasoning": 0.837680269381614,
"adv/std_final_conf": 0.23417723178863525,
"adv/std_reasoning": 0.23382404446601868,
"adv/std_step_conf": 0.21824036538600922,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.96875,
"calib/ece": 0.2386666666666667,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.4,
"calib/gap": 0.5972222222222222,
"calib/mean_conf": 0.6266666666666667,
"calib/mu_c": 0.985,
"calib/mu_w": 0.3877777777777778,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.2326666666666667,
"calib/std_conf": 0.3324321150417196,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.99,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.4862210526315789,
"calib/step_q_w": 0.5037789473684211,
"calib/step_q_w_n": 247.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 813.0,
"completions/max_terminated_length": 813.0,
"completions/mean_length": 160.5,
"completions/mean_terminated_length": 160.5,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.02655690908432007,
"kl": 0.32733154296875,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.1412,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0013986306730657816,
"mask/share_reasoning": 0.8904591798782349,
"mask/share_step_conf": 0.1081421747803688,
"num_tokens": 32338167.0,
"reward": 0.004336560145020485,
"reward_std": 0.011081664822995663,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.007699218578636646,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.002151099033653736,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.057930298149585724,
"adv/mean_abs_reasoning": 0.07717613130807877,
"adv/mean_abs_step_conf": 0.057131461799144745,
"adv/ratio_final_to_reasoning": 0.7506245411335046,
"adv/ratio_step_to_reasoning": 0.7402737197473935,
"adv/std_final_conf": 0.28660330176353455,
"adv/std_reasoning": 0.3306655287742615,
"adv/std_step_conf": 0.2827037274837494,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.93359375,
"calib/ece": 0.37,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.2,
"calib/gap": 0.10833333333333328,
"calib/mean_conf": 0.43,
"calib/mu_c": 0.495,
"calib/mu_w": 0.3866666666666667,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.9375,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.2,
"calib/std_conf": 0.38481164223552283,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.545,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.04295541490857946,
"calib/step_q_w": 0.5020445850914206,
"calib/step_q_w_n": 237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1105.0,
"completions/max_terminated_length": 1105.0,
"completions/mean_length": 184.875,
"completions/mean_terminated_length": 184.875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.176,
"grad_norm": 0.04415847733616829,
"kl": 0.28472900390625,
"learning_rate": 9.722222222222224e-07,
"loss": -0.2662,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0005913099739700556,
"mask/share_reasoning": 0.8977833986282349,
"mask/share_step_conf": 0.10162527859210968,
"num_tokens": 32491071.0,
"reward": 0.004669106099754572,
"reward_std": 0.014850882813334465,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.008543359115719795,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.0038926471024751663,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.1159270703792572,
"adv/mean_abs_reasoning": 0.15429779887199402,
"adv/mean_abs_step_conf": 0.11581672728061676,
"adv/ratio_final_to_reasoning": 0.7513203119341365,
"adv/ratio_step_to_reasoning": 0.7506051811970352,
"adv/std_final_conf": 0.40555065870285034,
"adv/std_reasoning": 0.46746668219566345,
"adv/std_step_conf": 0.4051646888256073,
"calib/answer_extract_rate": 0.03515625,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.90234375,
"calib/ece": 0.38999999999999996,
"calib/final_conf_rate": 0.03515625,
"calib/format_rate": 0.0234375,
"calib/frac_conf_gt_0.9": 0.1111111111111111,
"calib/gap": 0.22449999999999992,
"calib/mean_conf": 0.5077777777777777,
"calib/mu_c": 0.6325,
"calib/mu_w": 0.40800000000000003,
"calib/nonempty_final_conf_rate": 0.03515625,
"calib/nonempty_reasoning_rate": 0.89453125,
"calib/nonempty_step_conf_rate": 0.8828125,
"calib/pce": 0.22666666666666666,
"calib/std_conf": 0.26080123456497917,
"calib/step_conf_rate": 0.8828125,
"calib/step_q_c": 0.73,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": 0.16625196506550222,
"calib/step_q_w": 0.5637480349344978,
"calib/step_q_w_n": 229.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1364.0,
"completions/max_terminated_length": 1364.0,
"completions/mean_length": 199.70703125,
"completions/mean_terminated_length": 200.49020385742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.06651771813631058,
"kl": 0.244354248046875,
"learning_rate": 9.444444444444445e-07,
"loss": -0.3756,
"mask/has_final_conf_rate": 0.03515625,
"mask/share_final_conf": 0.0025812385138124228,
"mask/share_reasoning": 0.8953035473823547,
"mask/share_step_conf": 0.0982089564204216,
"num_tokens": 32648380.0,
"reward": 0.008751096203923225,
"reward_std": 0.02475183829665184,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.01907304674386978,
"rewards/format_reward_step": 0.0234375,
"rewards/step_l1_reward": -0.00938335433602333,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.057971835136413574,
"adv/mean_abs_reasoning": 0.06382165849208832,
"adv/mean_abs_step_conf": 0.057953860610723495,
"adv/ratio_final_to_reasoning": 0.9083410946395272,
"adv/ratio_step_to_reasoning": 0.9080594578705249,
"adv/std_final_conf": 0.28680866956710815,
"adv/std_reasoning": 0.2862262427806854,
"adv/std_step_conf": 0.28671973943710327,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.9140625,
"calib/ece": 0.07853333333333333,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.16666666666666666,
"calib/gap": 0.89776,
"calib/mean_conf": 0.2418666666666667,
"calib/mu_c": 0.99,
"calib/mu_w": 0.09224000000000002,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.07686666666666667,
"calib/std_conf": 0.3421813684128475,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_w": 0.557118660968661,
"calib/step_q_w_n": 234.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1585.0,
"completions/max_terminated_length": 1585.0,
"completions/mean_length": 188.90625,
"completions/mean_terminated_length": 188.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.04468516632914543,
"kl": 0.289154052734375,
"learning_rate": 9.166666666666666e-07,
"loss": -0.207,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0017223083414137363,
"mask/share_reasoning": 0.9020618200302124,
"mask/share_step_conf": 0.0962158814072609,
"num_tokens": 32802348.0,
"reward": 0.003114057704806328,
"reward_std": 0.008013888262212276,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.011639062315225601,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.00853594671934843,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.038645610213279724,
"adv/mean_abs_reasoning": 0.10238249599933624,
"adv/mean_abs_step_conf": 0.03857024013996124,
"adv/ratio_final_to_reasoning": 0.37746305983329675,
"adv/ratio_step_to_reasoning": 0.37672689812339893,
"adv/std_final_conf": 0.23416449129581451,
"adv/std_reasoning": 0.36948361992836,
"adv/std_step_conf": 0.23370780050754547,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.6666666666666667,
"calib/avg_num_step_conf": 0.93359375,
"calib/ece": 0.2557142857142858,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.42857142857142855,
"calib/gap": 0.35250000000000004,
"calib/mean_conf": 0.7014285714285713,
"calib/mu_c": 0.8525,
"calib/mu_w": 0.5,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.1928571428571429,
"calib/std_conf": 0.3081942138804487,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_w": 0.5231748953974896,
"calib/step_q_w_n": 239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 804.0,
"completions/max_terminated_length": 804.0,
"completions/mean_length": 174.734375,
"completions/mean_terminated_length": 175.41961669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.1792,
"grad_norm": 0.032676585018634796,
"kl": 0.303375244140625,
"learning_rate": 8.88888888888889e-07,
"loss": -0.1472,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0014493621420115232,
"mask/share_reasoning": 0.895774781703949,
"mask/share_step_conf": 0.09886964410543442,
"num_tokens": 32951752.0,
"reward": 0.005033358000218868,
"reward_std": 0.013473371975123882,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.0070917969569563866,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0017125809099525213,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.06294244527816772,
"adv/mean_abs_reasoning": 0.10074294358491898,
"adv/mean_abs_step_conf": 0.06367062032222748,
"adv/ratio_final_to_reasoning": 0.6247826700151144,
"adv/ratio_step_to_reasoning": 0.6320107201211346,
"adv/std_final_conf": 0.28657039999961853,
"adv/std_reasoning": 0.3696483075618744,
"adv/std_step_conf": 0.2865627706050873,
"calib/answer_extract_rate": 0.03515625,
"calib/auroc": 0.45833333333333337,
"calib/avg_num_step_conf": 0.9296875,
"calib/ece": 0.505,
"calib/final_conf_rate": 0.0390625,
"calib/format_rate": 0.015625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.06666666666666665,
"calib/mean_conf": 0.525,
"calib/mu_c": 0.485,
"calib/mu_w": 0.5516666666666666,
"calib/nonempty_final_conf_rate": 0.0390625,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.88671875,
"calib/pce": 0.315,
"calib/std_conf": 0.29282247181526216,
"calib/step_conf_rate": 0.88671875,
"calib/step_q_c": 0.525,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": -0.007378389830508425,
"calib/step_q_w": 0.5323783898305084,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2561.0,
"completions/max_terminated_length": 2561.0,
"completions/mean_length": 201.8359375,
"completions/mean_terminated_length": 202.62745666503906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.031436190009117126,
"kl": 0.26751708984375,
"learning_rate": 8.611111111111112e-07,
"loss": -0.2209,
"mask/has_final_conf_rate": 0.0390625,
"mask/share_final_conf": 0.0018767904257401824,
"mask/share_reasoning": 0.8958785533905029,
"mask/share_step_conf": 0.09833839535713196,
"num_tokens": 33107606.0,
"reward": 0.005335357505828142,
"reward_std": 0.012339570559561253,
"rewards/accuracy_reward_step": 0.015625,
"rewards/final_brier_reward_step": 0.010004688054323196,
"rewards/format_reward_step": 0.015625,
"rewards/step_l1_reward": -0.005583972670137882,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.03864695504307747,
"adv/mean_abs_reasoning": 0.07714889943599701,
"adv/mean_abs_step_conf": 0.03758513927459717,
"adv/ratio_final_to_reasoning": 0.5009398102320192,
"adv/ratio_step_to_reasoning": 0.4871766097684638,
"adv/std_final_conf": 0.23417262732982635,
"adv/std_reasoning": 0.33054885268211365,
"adv/std_step_conf": 0.22782209515571594,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.6666666666666666,
"calib/avg_num_step_conf": 0.94921875,
"calib/ece": 0.4,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": 0.03499999999999992,
"calib/mean_conf": 0.5599999999999999,
"calib/mu_c": 0.58,
"calib/mu_w": 0.545,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.2657142857142857,
"calib/std_conf": 0.28784916685156975,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.99,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.462343153526971,
"calib/step_q_w": 0.527656846473029,
"calib/step_q_w_n": 241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1106.0,
"completions/max_terminated_length": 1106.0,
"completions/mean_length": 165.04296875,
"completions/mean_terminated_length": 165.04296875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.03978708013892174,
"kl": 0.311492919921875,
"learning_rate": 8.333333333333333e-07,
"loss": -0.1579,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0013750765938311815,
"mask/share_reasoning": 0.8882984519004822,
"mask/share_step_conf": 0.11032651364803314,
"num_tokens": 33254009.0,
"reward": 0.005037762224674225,
"reward_std": 0.014248941093683243,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.007459375075995922,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0012901013251394033,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.057969287037849426,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.05761338770389557,
"adv/ratio_final_to_reasoning": 1.0022134854125593,
"adv/ratio_step_to_reasoning": 0.9960604493797887,
"adv/std_final_conf": 0.28679606318473816,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.2850436270236969,
"calib/answer_extract_rate": 0.01953125,
"calib/avg_num_step_conf": 0.92578125,
"calib/ece": 0.3583333333333333,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.35833333333333334,
"calib/mu_c": NaN,
"calib/mu_w": 0.35833333333333334,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.3583333333333333,
"calib/std_conf": 0.19186945793661087,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_w": 0.5368689170182841,
"calib/step_q_w_n": 237.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 921.0,
"completions/max_terminated_length": 921.0,
"completions/mean_length": 166.6640625,
"completions/mean_terminated_length": 166.6640625,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.1824,
"grad_norm": 0.04051095247268677,
"kl": 0.2969970703125,
"learning_rate": 8.055555555555557e-07,
"loss": -0.2421,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0008004299597814679,
"mask/share_reasoning": 0.8912744522094727,
"mask/share_step_conf": 0.10792511701583862,
"num_tokens": 33403571.0,
"reward": 0.004929243121296167,
"reward_std": 0.01394200511276722,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.010848437435925007,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.0033337008208036423,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.0579204298555851,
"adv/mean_abs_reasoning": 0.11568251252174377,
"adv/mean_abs_step_conf": 0.05779913812875748,
"adv/ratio_final_to_reasoning": 0.5006844041764595,
"adv/ratio_step_to_reasoning": 0.49963591617093817,
"adv/std_final_conf": 0.2865545153617859,
"adv/std_reasoning": 0.4046950340270996,
"adv/std_step_conf": 0.28595492243766785,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.9453125,
"calib/ece": 0.3814285714285714,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.14285714285714285,
"calib/gap": 0.22333333333333333,
"calib/mean_conf": 0.5157142857142857,
"calib/mu_c": 0.6433333333333333,
"calib/mu_w": 0.42,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.23428571428571426,
"calib/std_conf": 0.3290617459582034,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_w": 0.5292363636363636,
"calib/step_q_w_n": 242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 979.0,
"completions/max_terminated_length": 979.0,
"completions/mean_length": 165.16015625,
"completions/mean_terminated_length": 165.80784606933594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.05485696345567703,
"kl": 0.28277587890625,
"learning_rate": 7.777777777777779e-07,
"loss": -0.2087,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0016495675081387162,
"mask/share_reasoning": 0.8820877075195312,
"mask/share_step_conf": 0.11235648393630981,
"num_tokens": 33549204.0,
"reward": 0.004013527184724808,
"reward_std": 0.011351969093084335,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.007190625183284283,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.003851071000099182,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.03862863779067993,
"adv/mean_abs_reasoning": 0.06382165849208832,
"adv/mean_abs_step_conf": 0.03851080313324928,
"adv/ratio_final_to_reasoning": 0.6052590719726995,
"adv/ratio_step_to_reasoning": 0.6034127605446558,
"adv/std_final_conf": 0.23406165838241577,
"adv/std_reasoning": 0.2862262427806854,
"adv/std_step_conf": 0.23334872722625732,
"calib/answer_extract_rate": 0.01953125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.921875,
"calib/ece": 0.20400000000000004,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.4,
"calib/gap": 0.6566666666666667,
"calib/mean_conf": 0.5960000000000001,
"calib/mu_c": 0.99,
"calib/mu_w": 0.3333333333333333,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.20000000000000004,
"calib/std_conf": 0.41572106032771544,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_w": 0.5427545197740112,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1437.0,
"completions/max_terminated_length": 1437.0,
"completions/mean_length": 180.21875,
"completions/mean_terminated_length": 180.21875,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.04185887426137924,
"kl": 0.29754638671875,
"learning_rate": 7.5e-07,
"loss": -0.1567,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.001149651245214045,
"mask/share_reasoning": 0.8883706331253052,
"mask/share_step_conf": 0.1104796975851059,
"num_tokens": 33698500.0,
"reward": 0.0030319697689265013,
"reward_std": 0.007411236874759197,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0053125000558793545,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0023735607974231243,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.038619525730609894,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.0386103130877018,
"adv/ratio_final_to_reasoning": 0.667681309624926,
"adv/ratio_step_to_reasoning": 0.6675220350257274,
"adv/std_final_conf": 0.23400649428367615,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.23395071923732758,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.6666666666666667,
"calib/avg_num_step_conf": 0.9453125,
"calib/ece": 0.31999999999999995,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.3533333333333334,
"calib/mean_conf": 0.405,
"calib/mu_c": 0.67,
"calib/mu_w": 0.31666666666666665,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.2375,
"calib/std_conf": 0.3623879137057416,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_w": 0.5508133608815428,
"calib/step_q_w_n": 242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1087.0,
"completions/max_terminated_length": 1087.0,
"completions/mean_length": 181.86328125,
"completions/mean_terminated_length": 182.57647705078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.1856,
"grad_norm": 0.03075559437274933,
"kl": 0.2668304443359375,
"learning_rate": 7.222222222222222e-07,
"loss": -0.1588,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0006882546003907919,
"mask/share_reasoning": 0.8974344730377197,
"mask/share_step_conf": 0.09797105193138123,
"num_tokens": 33849289.0,
"reward": 0.0014810014981776476,
"reward_std": 0.0041889045387506485,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.004951172042638063,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.004332918673753738,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.88671875,
"calib/ece": 0.43999999999999995,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.2400000000000001,
"calib/mean_conf": 0.64,
"calib/mu_c": 0.8,
"calib/mu_w": 0.5599999999999999,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.8984375,
"calib/nonempty_step_conf_rate": 0.88671875,
"calib/pce": 0.3733333333333333,
"calib/std_conf": 0.16083117442419761,
"calib/step_conf_rate": 0.88671875,
"calib/step_q_w": 0.46687679882525696,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2529.0,
"completions/max_terminated_length": 2529.0,
"completions/mean_length": 188.6875,
"completions/mean_terminated_length": 188.6875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.003241309430450201,
"kl": 0.2762603759765625,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0061,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.00039259361801669,
"mask/share_reasoning": 0.8896238207817078,
"mask/share_step_conf": 0.10998360067605972,
"num_tokens": 34003417.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0234375,
"calib/avg_num_step_conf": 0.88671875,
"calib/ece": 0.42166666666666663,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.4216666666666667,
"calib/mu_c": NaN,
"calib/mu_w": 0.4216666666666667,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.8828125,
"calib/pce": 0.42166666666666663,
"calib/std_conf": 0.34503220461606515,
"calib/step_conf_rate": 0.8828125,
"calib/step_q_w": 0.5479162995594714,
"calib/step_q_w_n": 227.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3017.0,
"completions/max_terminated_length": 3017.0,
"completions/mean_length": 209.3828125,
"completions/mean_terminated_length": 209.3828125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.002791334642097354,
"kl": 0.24755859375,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0212,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0006819250411354005,
"mask/share_reasoning": 0.9036825299263,
"mask/share_step_conf": 0.09563553333282471,
"num_tokens": 34161083.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.019307058304548264,
"adv/mean_abs_reasoning": 0.03858806565403938,
"adv/mean_abs_step_conf": 0.019321506842970848,
"adv/ratio_final_to_reasoning": 0.500337551968667,
"adv/ratio_step_to_reasoning": 0.5007119822018931,
"adv/std_final_conf": 0.16544435918331146,
"adv/std_reasoning": 0.23381584882736206,
"adv/std_step_conf": 0.16556817293167114,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.9375,
"calib/ece": 0.765,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.53,
"calib/mean_conf": 0.485,
"calib/mu_c": 0.22,
"calib/mu_w": 0.75,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.375,
"calib/std_conf": 0.265,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.14,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": -0.3586164574616457,
"calib/step_q_w": 0.49861645746164573,
"calib/step_q_w_n": 239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1485.0,
"completions/max_terminated_length": 1485.0,
"completions/mean_length": 168.48828125,
"completions/mean_terminated_length": 168.48828125,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1888,
"grad_norm": 0.024655012413859367,
"kl": 0.273468017578125,
"learning_rate": 6.388888888888889e-07,
"loss": -0.0691,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0005021016113460064,
"mask/share_reasoning": 0.8795663118362427,
"mask/share_step_conf": 0.11993157863616943,
"num_tokens": 34308048.0,
"reward": 0.0003487913345452398,
"reward_std": 0.0012231777654960752,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.00152968754991889,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.003175854915753007,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.03864792734384537,
"adv/mean_abs_reasoning": 0.057868484407663345,
"adv/mean_abs_step_conf": 0.03859715908765793,
"adv/ratio_final_to_reasoning": 0.6678579496152718,
"adv/ratio_step_to_reasoning": 0.6669806455575088,
"adv/std_final_conf": 0.23417851328849792,
"adv/std_reasoning": 0.28629738092422485,
"adv/std_step_conf": 0.23387092351913452,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 1.01953125,
"calib/ece": 0.14600000000000002,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.2,
"calib/gap": 0.81,
"calib/mean_conf": 0.34199999999999997,
"calib/mu_c": 0.99,
"calib/mu_w": 0.18,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.14400000000000002,
"calib/std_conf": 0.3584633872517527,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.71,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.2291955128205127,
"calib/step_q_w": 0.48080448717948726,
"calib/step_q_w_n": 260.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 749.0,
"completions/max_terminated_length": 749.0,
"completions/mean_length": 163.58984375,
"completions/mean_terminated_length": 164.23138427734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.04221513122320175,
"kl": 0.303070068359375,
"learning_rate": 6.111111111111112e-07,
"loss": -0.1459,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0008479391690343618,
"mask/share_reasoning": 0.8811119794845581,
"mask/share_step_conf": 0.11413383483886719,
"num_tokens": 34455999.0,
"reward": 0.004273958504199982,
"reward_std": 0.012088580057024956,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.007773046847432852,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0023501296527683735,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.019323695451021194,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.01888473890721798,
"adv/ratio_final_to_reasoning": 1.0022445932295478,
"adv/ratio_step_to_reasoning": 0.9794776321270714,
"adv/std_final_conf": 0.165586918592453,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.16182544827461243,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.93359375,
"calib/ece": 0.17,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.17,
"calib/mu_c": NaN,
"calib/mu_w": 0.17,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.17,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_w": 0.5037097629009764,
"calib/step_q_w_n": 239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2128.0,
"completions/max_terminated_length": 2128.0,
"completions/mean_length": 157.078125,
"completions/mean_terminated_length": 157.078125,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.03157714381814003,
"kl": 0.33184814453125,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0458,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.0005696614389307797,
"mask/share_reasoning": 0.896777331829071,
"mask/share_step_conf": 0.10265299677848816,
"num_tokens": 34602475.0,
"reward": 0.0022409602534025908,
"reward_std": 0.006338392850011587,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0037933592684566975,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -9.268872963730246e-05,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.038644008338451385,
"adv/mean_abs_reasoning": 0.09640209376811981,
"adv/mean_abs_step_conf": 0.03858642280101776,
"adv/ratio_final_to_reasoning": 0.400862749219986,
"adv/ratio_step_to_reasoning": 0.40026540184730197,
"adv/std_final_conf": 0.23415479063987732,
"adv/std_reasoning": 0.3694343566894531,
"adv/std_step_conf": 0.233805850148201,
"calib/answer_extract_rate": 0.03125,
"calib/auroc": 0.5333333333333333,
"calib/avg_num_step_conf": 0.87109375,
"calib/ece": 0.51125,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.125,
"calib/gap": -0.0020000000000000018,
"calib/mean_conf": 0.54125,
"calib/mu_c": 0.54,
"calib/mu_w": 0.542,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.88671875,
"calib/nonempty_step_conf_rate": 0.86328125,
"calib/pce": 0.33875,
"calib/std_conf": 0.30097497819586266,
"calib/step_conf_rate": 0.86328125,
"calib/step_q_w": 0.5032677130044844,
"calib/step_q_w_n": 223.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2056.0,
"completions/max_terminated_length": 2056.0,
"completions/mean_length": 205.48828125,
"completions/mean_terminated_length": 205.48828125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.192,
"grad_norm": 0.034869104623794556,
"kl": 0.2762451171875,
"learning_rate": 5.555555555555555e-07,
"loss": -0.1626,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0018083257600665092,
"mask/share_reasoning": 0.903313159942627,
"mask/share_step_conf": 0.09487849473953247,
"num_tokens": 34758936.0,
"reward": 0.004252666607499123,
"reward_std": 0.01202835701406002,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.006643359549343586,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.002044275403022766,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.019241439178586006,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.019322792068123817,
"adv/ratio_final_to_reasoning": 0.49898914086005963,
"adv/ratio_step_to_reasoning": 0.5010988691439051,
"adv/std_final_conf": 0.16488207876682281,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.1655791848897934,
"calib/answer_extract_rate": 0.03515625,
"calib/auroc": 0.5714285714285714,
"calib/avg_num_step_conf": 0.90625,
"calib/ece": 0.465,
"calib/final_conf_rate": 0.03125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.125,
"calib/gap": 0.18285714285714294,
"calib/mean_conf": 0.5900000000000001,
"calib/mu_c": 0.75,
"calib/mu_w": 0.5671428571428571,
"calib/nonempty_final_conf_rate": 0.03125,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.875,
"calib/pce": 0.465,
"calib/std_conf": 0.25446021299998944,
"calib/step_conf_rate": 0.875,
"calib/step_q_w": 0.5063456896551723,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2063.0,
"completions/max_terminated_length": 2063.0,
"completions/mean_length": 198.19140625,
"completions/mean_terminated_length": 198.19140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.020284345373511314,
"kl": 0.28765869140625,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0618,
"mask/has_final_conf_rate": 0.03125,
"mask/share_final_conf": 0.0029326127842068672,
"mask/share_reasoning": 0.8988545536994934,
"mask/share_step_conf": 0.0982128456234932,
"num_tokens": 34915937.0,
"reward": -0.0007472168654203415,
"reward_std": 0.004323157016187906,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0004546875134110451,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0035116211511194706,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.03859543055295944,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.038640618324279785,
"adv/ratio_final_to_reasoning": 0.6672647353800345,
"adv/ratio_step_to_reasoning": 0.6680459731027516,
"adv/std_final_conf": 0.2338605523109436,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.2341342270374298,
"calib/answer_extract_rate": 0.01171875,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.96484375,
"calib/ece": 0.4966666666666667,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.24,
"calib/mean_conf": 0.83,
"calib/mu_c": 0.99,
"calib/mu_w": 0.75,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.4966666666666667,
"calib/std_conf": 0.16673332000533067,
"calib/step_conf_rate": 0.953125,
"calib/step_q_w": 0.5249087719298245,
"calib/step_q_w_n": 247.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2943.0,
"completions/max_terminated_length": 2943.0,
"completions/mean_length": 186.1484375,
"completions/mean_terminated_length": 186.1484375,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.044177763164043427,
"kl": 0.3182373046875,
"learning_rate": 5.000000000000001e-07,
"loss": -0.1202,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0004936805926263332,
"mask/share_reasoning": 0.8925653696060181,
"mask/share_step_conf": 0.10694096982479095,
"num_tokens": 35069751.0,
"reward": -0.00030797565705142915,
"reward_std": 0.005734951235353947,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0032421874348074198,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.006201888434588909,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.019319282844662666,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.019303763285279274,
"adv/ratio_final_to_reasoning": 0.33400524285866284,
"adv/ratio_step_to_reasoning": 0.3337369298864584,
"adv/std_final_conf": 0.16554909944534302,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.16541613638401031,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.75,
"calib/avg_num_step_conf": 0.921875,
"calib/ece": 0.07250000000000001,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.25,
"calib/gap": 0.3549999999999999,
"calib/mean_conf": 0.5325,
"calib/mu_c": 0.71,
"calib/mu_w": 0.35500000000000004,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.05250000000000001,
"calib/std_conf": 0.2693858756505248,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_w": 0.5333563559322034,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2107.0,
"completions/max_terminated_length": 2107.0,
"completions/mean_length": 187.48046875,
"completions/mean_terminated_length": 187.48046875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.1952,
"grad_norm": 0.022845527157187462,
"kl": 0.2704925537109375,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.0812,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0006599759799428284,
"mask/share_reasoning": 0.9025527834892273,
"mask/share_step_conf": 0.09678725898265839,
"num_tokens": 35224426.0,
"reward": 0.0018502858001738787,
"reward_std": 0.005233398173004389,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.002724609337747097,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0013677878305315971,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.921875,
"calib/ece": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.0,
"calib/mu_c": NaN,
"calib/mu_w": 0.0,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_w": 0.551475988700565,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1173.0,
"completions/max_terminated_length": 1173.0,
"completions/mean_length": 177.40234375,
"completions/mean_terminated_length": 177.40234375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.0030443009454756975,
"kl": 0.288055419921875,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0214,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.000396286224713549,
"mask/share_reasoning": 0.892707109451294,
"mask/share_step_conf": 0.10689658671617508,
"num_tokens": 35375121.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.01931559108197689,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.019306572154164314,
"adv/ratio_final_to_reasoning": 0.33394141708923064,
"adv/ratio_step_to_reasoning": 0.33378549157177817,
"adv/std_final_conf": 0.16551747918128967,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.16544018685817719,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.125,
"calib/avg_num_step_conf": 0.84375,
"calib/ece": 0.6966666666666667,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": -0.245,
"calib/mean_conf": 0.7633333333333333,
"calib/mu_c": 0.6,
"calib/mu_w": 0.845,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.85546875,
"calib/nonempty_step_conf_rate": 0.8359375,
"calib/pce": 0.5633333333333332,
"calib/std_conf": 0.17123732718721763,
"calib/step_conf_rate": 0.8359375,
"calib/step_q_w": 0.5409444444444444,
"calib/step_q_w_n": 216.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2607.0,
"completions/max_terminated_length": 2607.0,
"completions/mean_length": 206.65234375,
"completions/mean_terminated_length": 206.65234375,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.02343006432056427,
"kl": 0.2517852783203125,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0215,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0009006580803543329,
"mask/share_reasoning": 0.9018073678016663,
"mask/share_step_conf": 0.09729200601577759,
"num_tokens": 35534944.0,
"reward": 0.001522548496723175,
"reward_std": 0.004306417424231768,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0022046875674277544,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0015033404342830181,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.038623660802841187,
"adv/mean_abs_reasoning": 0.06384889036417007,
"adv/mean_abs_step_conf": 0.03805459290742874,
"adv/ratio_final_to_reasoning": 0.6049229764612406,
"adv/ratio_step_to_reasoning": 0.5960102468559696,
"adv/std_final_conf": 0.23403151333332062,
"adv/std_reasoning": 0.2863609790802002,
"adv/std_step_conf": 0.23061054944992065,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.8333333333333333,
"calib/avg_num_step_conf": 0.9375,
"calib/ece": 0.262,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.17833333333333334,
"calib/mean_conf": 0.662,
"calib/mu_c": 0.7333333333333333,
"calib/mu_w": 0.5549999999999999,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.16199999999999998,
"calib/std_conf": 0.23292917378465067,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.98,
"calib/step_q_c_n": 1.0,
"calib/step_q_gap": 0.463876150627615,
"calib/step_q_w": 0.516123849372385,
"calib/step_q_w_n": 239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2867.0,
"completions/max_terminated_length": 2867.0,
"completions/mean_length": 198.6484375,
"completions/mean_terminated_length": 198.6484375,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.1984,
"grad_norm": 0.028277983888983727,
"kl": 0.2912445068359375,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.1085,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0014739439357072115,
"mask/share_reasoning": 0.8970207571983337,
"mask/share_step_conf": 0.10150527954101562,
"num_tokens": 35690838.0,
"reward": 0.003056081011891365,
"reward_std": 0.010295093059539795,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.005044922232627869,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.002839010441675782,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.0579669326543808,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.05779435485601425,
"adv/ratio_final_to_reasoning": 1.0021727811882593,
"adv/ratio_step_to_reasoning": 0.9991891357849126,
"adv/std_final_conf": 0.28678441047668457,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.28593212366104126,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.8828125,
"calib/ece": 0.2025,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.2025,
"calib/mu_c": NaN,
"calib/mu_w": 0.2025,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.87109375,
"calib/nonempty_step_conf_rate": 0.8671875,
"calib/pce": 0.2025,
"calib/std_conf": 0.21787324296480282,
"calib/step_conf_rate": 0.8671875,
"calib/step_q_w": 0.54857802359882,
"calib/step_q_w_n": 226.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1382.0,
"completions/max_terminated_length": 1382.0,
"completions/mean_length": 218.73828125,
"completions/mean_terminated_length": 218.73828125,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.03050275146961212,
"kl": 0.286041259765625,
"learning_rate": 3.611111111111111e-07,
"loss": -0.1996,
"mask/has_final_conf_rate": 0.015625,
"mask/share_final_conf": 0.0014880726812407374,
"mask/share_reasoning": 0.9059315919876099,
"mask/share_step_conf": 0.09258037060499191,
"num_tokens": 35848379.0,
"reward": 0.004234543535858393,
"reward_std": 0.011977097019553185,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.010336718522012234,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.004211381543427706,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.019320379942655563,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.019292619079351425,
"adv/ratio_final_to_reasoning": 1.0020726307629646,
"adv/ratio_step_to_reasoning": 1.0006327832337754,
"adv/std_final_conf": 0.16555850207805634,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.16532061994075775,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.93359375,
"calib/ece": 0.43333333333333335,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.43333333333333335,
"calib/mu_c": NaN,
"calib/mu_w": 0.43333333333333335,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.43333333333333335,
"calib/std_conf": 0.32998316455372223,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_w": 0.5555447698744769,
"calib/step_q_w_n": 239.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1503.0,
"completions/max_terminated_length": 1503.0,
"completions/mean_length": 208.265625,
"completions/mean_terminated_length": 208.265625,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.025662289932370186,
"kl": 0.23193359375,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0553,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0011713827261701226,
"mask/share_reasoning": 0.893328845500946,
"mask/share_step_conf": 0.105499766767025,
"num_tokens": 36005767.0,
"reward": 0.0013519477797672153,
"reward_std": 0.0038238859269768,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0029296875,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0010070418938994408,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.9453125,
"calib/ece": 0.67,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.67,
"calib/mu_c": NaN,
"calib/mu_w": 0.67,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.67,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.921875,
"calib/step_q_w": 0.5065026170798899,
"calib/step_q_w_n": 242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1016.0,
"completions/max_terminated_length": 1016.0,
"completions/mean_length": 180.62890625,
"completions/mean_terminated_length": 180.62890625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.2016,
"grad_norm": 0.002889833180233836,
"kl": 0.284515380859375,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0214,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00011811555305030197,
"mask/share_reasoning": 0.8950053453445435,
"mask/share_step_conf": 0.1048765480518341,
"num_tokens": 36159776.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.019304616376757622,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.01932068169116974,
"adv/ratio_final_to_reasoning": 0.5006275180908379,
"adv/ratio_step_to_reasoning": 0.5010441406397931,
"adv/std_final_conf": 0.1654234230518341,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.16556109488010406,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.4,
"calib/avg_num_step_conf": 0.921875,
"calib/ece": 0.4116666666666666,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.22999999999999998,
"calib/mean_conf": 0.3316666666666667,
"calib/mu_c": 0.14,
"calib/mu_w": 0.37,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.28833333333333333,
"calib/std_conf": 0.30212672102208293,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_w": 0.5487427966101693,
"calib/step_q_w_n": 236.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1556.0,
"completions/max_terminated_length": 1556.0,
"completions/mean_length": 201.04296875,
"completions/mean_terminated_length": 201.04296875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.03346448019146919,
"kl": 0.2524871826171875,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0267,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0015675068134441972,
"mask/share_reasoning": 0.9004506468772888,
"mask/share_step_conf": 0.09798184782266617,
"num_tokens": 36316851.0,
"reward": -1.1547759640961885e-05,
"reward_std": 0.0022423705086112022,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0014062500558793545,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0029918455984443426,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.019319752231240273,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.01928817853331566,
"adv/ratio_final_to_reasoning": 0.5010200369120332,
"adv/ratio_step_to_reasoning": 0.5002012347291533,
"adv/std_final_conf": 0.16555313766002655,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.16528257727622986,
"calib/answer_extract_rate": 0.0078125,
"calib/auroc": 1.0,
"calib/avg_num_step_conf": 0.9921875,
"calib/ece": 0.34,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": 0.31999999999999995,
"calib/mean_conf": 0.69,
"calib/mu_c": 0.85,
"calib/mu_w": 0.53,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.93359375,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.265,
"calib/std_conf": 0.15999999999999998,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_w": 0.5243385826771654,
"calib/step_q_w_n": 254.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1479.0,
"completions/max_terminated_length": 1479.0,
"completions/mean_length": 152.26171875,
"completions/mean_terminated_length": 152.26171875,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.022792931646108627,
"kl": 0.31573486328125,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.073,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0003579213807824999,
"mask/share_reasoning": 0.8862248659133911,
"mask/share_step_conf": 0.11341720819473267,
"num_tokens": 36459998.0,
"reward": 0.0017301104962825775,
"reward_std": 0.004893491044640541,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0028089843690395355,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.0009112633997574449,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.0386478528380394,
"adv/mean_abs_reasoning": 0.10836289823055267,
"adv/mean_abs_step_conf": 0.03857652470469475,
"adv/ratio_final_to_reasoning": 0.35665207805546423,
"adv/ratio_step_to_reasoning": 0.35599384415336893,
"adv/std_final_conf": 0.2341780662536621,
"adv/std_reasoning": 0.36953291296958923,
"adv/std_step_conf": 0.23374615609645844,
"calib/answer_extract_rate": 0.03515625,
"calib/auroc": 0.8333333333333334,
"calib/avg_num_step_conf": 0.8984375,
"calib/ece": 0.21571428571428572,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.42857142857142855,
"calib/gap": 0.49,
"calib/mean_conf": 0.55,
"calib/mu_c": 0.76,
"calib/mu_w": 0.27,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.09714285714285714,
"calib/std_conf": 0.42311768035449837,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_w": 0.45417101449275354,
"calib/step_q_w_n": 230.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1404.0,
"completions/max_terminated_length": 1404.0,
"completions/mean_length": 214.1796875,
"completions/mean_terminated_length": 214.1796875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.2048,
"grad_norm": 0.03924354165792465,
"kl": 0.2512054443359375,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.1604,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0009351474000141025,
"mask/share_reasoning": 0.897925078868866,
"mask/share_step_conf": 0.10113979130983353,
"num_tokens": 36619804.0,
"reward": 0.004981360863894224,
"reward_std": 0.012563186697661877,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.007746484130620956,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0032525130081921816,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.0,
"adv/mean_abs_reasoning": 0.019280418753623962,
"adv/mean_abs_step_conf": 0.0,
"adv/ratio_final_to_reasoning": 0.0,
"adv/ratio_step_to_reasoning": 0.0,
"adv/std_final_conf": 0.0,
"adv/std_reasoning": 0.16521605849266052,
"adv/std_step_conf": 0.0,
"calib/answer_extract_rate": 0.015625,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.90625,
"calib/ece": 0.74,
"calib/final_conf_rate": 0.015625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": -0.18666666666666676,
"calib/mean_conf": 0.8400000000000001,
"calib/mu_c": 0.7,
"calib/mu_w": 0.8866666666666667,
"calib/nonempty_final_conf_rate": 0.015625,
"calib/nonempty_reasoning_rate": 0.90625,
"calib/nonempty_step_conf_rate": 0.890625,
"calib/pce": 0.665,
"calib/std_conf": 0.08803408430829507,
"calib/step_conf_rate": 0.890625,
"calib/step_q_w": 0.6004439655172413,
"calib/step_q_w_n": 232.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 189.59765625,
"completions/mean_terminated_length": 190.3411865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.002885238267481327,
"kl": 0.249420166015625,
"learning_rate": 1.9444444444444447e-07,
"loss": 0.0013,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0010916463797912002,
"mask/share_reasoning": 0.9009318351745605,
"mask/share_step_conf": 0.09407031536102295,
"num_tokens": 36774053.0,
"reward": 0.0003906250058207661,
"reward_std": 0.001104854280129075,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.0,
"rewards/format_reward_step": 0.0,
"rewards/step_l1_reward": 0.0,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.038605351001024246,
"adv/mean_abs_reasoning": 0.07717613130807877,
"adv/mean_abs_step_conf": 0.0577361173927784,
"adv/ratio_final_to_reasoning": 0.5002239727062227,
"adv/ratio_step_to_reasoning": 0.7481084684369843,
"adv/std_final_conf": 0.2339206337928772,
"adv/std_reasoning": 0.3306655287742615,
"adv/std_step_conf": 0.28564655780792236,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.0,
"calib/avg_num_step_conf": 0.91796875,
"calib/ece": 0.43833333333333324,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.0,
"calib/gap": -0.2875,
"calib/mean_conf": 0.25166666666666665,
"calib/mu_c": 0.06,
"calib/mu_w": 0.3475,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.1783333333333333,
"calib/std_conf": 0.20843997270730538,
"calib/step_conf_rate": 0.90625,
"calib/step_q_c": 0.135,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": -0.397555078683834,
"calib/step_q_w": 0.532555078683834,
"calib/step_q_w_n": 233.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 795.0,
"completions/max_terminated_length": 795.0,
"completions/mean_length": 187.640625,
"completions/mean_terminated_length": 188.37648010253906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.029423270374536514,
"kl": 0.246551513671875,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.1481,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0009616934694349766,
"mask/share_reasoning": 0.9088914394378662,
"mask/share_step_conf": 0.08624064922332764,
"num_tokens": 36928033.0,
"reward": 0.00113745778799057,
"reward_std": 0.009823394939303398,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.0035183595027774572,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.005930944345891476,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.057969868183135986,
"adv/mean_abs_reasoning": 0.1246509775519371,
"adv/mean_abs_step_conf": 0.05788629502058029,
"adv/ratio_final_to_reasoning": 0.4650574694368702,
"adv/ratio_step_to_reasoning": 0.46438701209913397,
"adv/std_final_conf": 0.28679895401000977,
"adv/std_reasoning": 0.4047554135322571,
"adv/std_step_conf": 0.28638574481010437,
"calib/answer_extract_rate": 0.04296875,
"calib/auroc": 0.41666666666666663,
"calib/avg_num_step_conf": 0.90234375,
"calib/ece": 0.5057142857142857,
"calib/final_conf_rate": 0.02734375,
"calib/format_rate": 0.01171875,
"calib/frac_conf_gt_0.9": 0.2857142857142857,
"calib/gap": -0.05166666666666664,
"calib/mean_conf": 0.3628571428571429,
"calib/mu_c": 0.3333333333333333,
"calib/mu_w": 0.38499999999999995,
"calib/nonempty_final_conf_rate": 0.02734375,
"calib/nonempty_reasoning_rate": 0.921875,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.21999999999999997,
"calib/std_conf": 0.41337337967073806,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_w": 0.5457682539682539,
"calib/step_q_w_n": 231.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2341.0,
"completions/max_terminated_length": 2341.0,
"completions/mean_length": 199.84375,
"completions/mean_terminated_length": 199.84375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.208,
"grad_norm": 0.03922109678387642,
"kl": 0.25457763671875,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.2147,
"mask/has_final_conf_rate": 0.02734375,
"mask/share_final_conf": 0.0022088377736508846,
"mask/share_reasoning": 0.8914566040039062,
"mask/share_step_conf": 0.10633458197116852,
"num_tokens": 37085177.0,
"reward": 0.0062236604280769825,
"reward_std": 0.015905946493148804,
"rewards/accuracy_reward_step": 0.01953125,
"rewards/final_brier_reward_step": 0.011027734726667404,
"rewards/format_reward_step": 0.01171875,
"rewards/step_l1_reward": -0.004830413497984409,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.01932217739522457,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.01932252198457718,
"adv/ratio_final_to_reasoning": 0.5010829288029016,
"adv/ratio_step_to_reasoning": 0.5010918650546763,
"adv/std_final_conf": 0.16557389497756958,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.16557686030864716,
"calib/answer_extract_rate": 0.01171875,
"calib/avg_num_step_conf": 0.96484375,
"calib/ece": 0.38,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.38,
"calib/mu_c": NaN,
"calib/mu_w": 0.38,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.38,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_w": 0.5592665317139002,
"calib/step_q_w_n": 247.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1046.0,
"completions/max_terminated_length": 1046.0,
"completions/mean_length": 162.5625,
"completions/mean_terminated_length": 162.5625,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.02420944906771183,
"kl": 0.298065185546875,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0729,
"mask/has_final_conf_rate": 0.00390625,
"mask/share_final_conf": 0.00036951014772057533,
"mask/share_reasoning": 0.8943559527397156,
"mask/share_step_conf": 0.10527454316616058,
"num_tokens": 37229337.0,
"reward": 0.0007347895880229771,
"reward_std": 0.0020782987121492624,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.003342187497764826,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.003435108345001936,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.01926400512456894,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.01931987702846527,
"adv/ratio_final_to_reasoning": 0.3330495630607619,
"adv/ratio_step_to_reasoning": 0.33401551552286507,
"adv/std_final_conf": 0.16507543623447418,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.16555418074131012,
"calib/answer_extract_rate": 0.0234375,
"calib/auroc": 0.875,
"calib/avg_num_step_conf": 0.94140625,
"calib/ece": 0.23666666666666664,
"calib/final_conf_rate": 0.0234375,
"calib/format_rate": 0.00390625,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.4974999999999999,
"calib/mean_conf": 0.4333333333333333,
"calib/mu_c": 0.7649999999999999,
"calib/mu_w": 0.2675,
"calib/nonempty_final_conf_rate": 0.0234375,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.1683333333333333,
"calib/std_conf": 0.4008186067980821,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_w": 0.4886099585062241,
"calib/step_q_w_n": 241.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1189.0,
"completions/max_terminated_length": 1189.0,
"completions/mean_length": 157.53125,
"completions/mean_terminated_length": 157.53125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.025260915979743004,
"kl": 0.313568115234375,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0755,
"mask/has_final_conf_rate": 0.0234375,
"mask/share_final_conf": 0.0016457033343613148,
"mask/share_reasoning": 0.8901784420013428,
"mask/share_step_conf": 0.10817582905292511,
"num_tokens": 37374721.0,
"reward": 5.584879545494914e-05,
"reward_std": 0.004261452704668045,
"rewards/accuracy_reward_step": 0.0078125,
"rewards/final_brier_reward_step": 0.0006000000284984708,
"rewards/format_reward_step": 0.00390625,
"rewards/step_l1_reward": -0.00283205253072083,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.03863754868507385,
"adv/mean_abs_reasoning": 0.05784125626087189,
"adv/mean_abs_step_conf": 0.03862864524126053,
"adv/ratio_final_to_reasoning": 0.6679929030381582,
"adv/ratio_step_to_reasoning": 0.6678389740886698,
"adv/std_final_conf": 0.2341156303882599,
"adv/std_reasoning": 0.2861626148223877,
"adv/std_step_conf": 0.23406170308589935,
"calib/answer_extract_rate": 0.015625,
"calib/avg_num_step_conf": 0.9296875,
"calib/ece": 0.3533333333333333,
"calib/final_conf_rate": 0.01171875,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.3533333333333333,
"calib/mu_c": NaN,
"calib/mu_w": 0.3533333333333333,
"calib/nonempty_final_conf_rate": 0.01171875,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.3533333333333333,
"calib/std_conf": 0.281701181317288,
"calib/step_conf_rate": 0.921875,
"calib/step_q_w": 0.5330756302521008,
"calib/step_q_w_n": 238.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2469.0,
"completions/max_terminated_length": 2469.0,
"completions/mean_length": 180.39453125,
"completions/mean_terminated_length": 180.39453125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.2112,
"grad_norm": 0.04118936508893967,
"kl": 0.31329345703125,
"learning_rate": 5.555555555555556e-08,
"loss": -0.1535,
"mask/has_final_conf_rate": 0.01171875,
"mask/share_final_conf": 0.0013754223473370075,
"mask/share_reasoning": 0.8814821243286133,
"mask/share_step_conf": 0.11714246869087219,
"num_tokens": 37526286.0,
"reward": 0.002013332908973098,
"reward_std": 0.005694565363228321,
"rewards/accuracy_reward_step": 0.00390625,
"rewards/final_brier_reward_step": 0.005898046772927046,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.004215131048113108,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.03863748535513878,
"adv/mean_abs_reasoning": 0.038560837507247925,
"adv/mean_abs_step_conf": 0.03851601108908653,
"adv/ratio_final_to_reasoning": 1.0019877122190732,
"adv/ratio_step_to_reasoning": 0.9988375144042718,
"adv/std_final_conf": 0.23411524295806885,
"adv/std_reasoning": 0.233650803565979,
"adv/std_step_conf": 0.2333802878856659,
"calib/answer_extract_rate": 0.0078125,
"calib/avg_num_step_conf": 0.9453125,
"calib/ece": 0.495,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.495,
"calib/mu_c": NaN,
"calib/mu_w": 0.495,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.495,
"calib/std_conf": 0.18500000000000003,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_w": 0.5458679063360882,
"calib/step_q_w_n": 242.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 174.23046875,
"completions/mean_terminated_length": 174.23046875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.040901776403188705,
"kl": 0.3167266845703125,
"learning_rate": 2.777777777777778e-08,
"loss": -0.1257,
"mask/has_final_conf_rate": 0.0078125,
"mask/share_final_conf": 0.0010554521577432752,
"mask/share_reasoning": 0.8885200023651123,
"mask/share_step_conf": 0.11042454093694687,
"num_tokens": 37675089.0,
"reward": 0.0022908253595232964,
"reward_std": 0.0064794328063726425,
"rewards/accuracy_reward_step": 0.0,
"rewards/final_brier_reward_step": 0.005630859173834324,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.0026117085944861174,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.038646847009658813,
"adv/mean_abs_reasoning": 0.0578957125544548,
"adv/mean_abs_step_conf": 0.0386156402528286,
"adv/ratio_final_to_reasoning": 0.6675251984040935,
"adv/ratio_step_to_reasoning": 0.6669861816884626,
"adv/std_final_conf": 0.23417198657989502,
"adv/std_reasoning": 0.2864321172237396,
"adv/std_step_conf": 0.2339828908443451,
"calib/answer_extract_rate": 0.02734375,
"calib/auroc": 0.5,
"calib/avg_num_step_conf": 0.921875,
"calib/ece": 0.26600000000000007,
"calib/final_conf_rate": 0.01953125,
"calib/format_rate": 0.0078125,
"calib/frac_conf_gt_0.9": 0.2,
"calib/gap": 0.21833333333333338,
"calib/mean_conf": 0.746,
"calib/mu_c": 0.8333333333333334,
"calib/mu_w": 0.615,
"calib/nonempty_final_conf_rate": 0.01953125,
"calib/nonempty_reasoning_rate": 0.92578125,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.20600000000000007,
"calib/std_conf": 0.2646960521050512,
"calib/step_conf_rate": 0.90625,
"calib/step_q_c": 0.485,
"calib/step_q_c_n": 2.0,
"calib/step_q_gap": -0.08322578347578347,
"calib/step_q_w": 0.5682257834757835,
"calib/step_q_w_n": 234.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2144.0,
"completions/max_terminated_length": 2144.0,
"completions/mean_length": 215.08984375,
"completions/mean_terminated_length": 215.08984375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.05663759633898735,
"kl": 0.2462615966796875,
"learning_rate": 0.0,
"loss": -0.1622,
"mask/has_final_conf_rate": 0.01953125,
"mask/share_final_conf": 0.0006202845834195614,
"mask/share_reasoning": 0.9045542478561401,
"mask/share_step_conf": 0.09482549130916595,
"num_tokens": 37838200.0,
"reward": 0.004006184637546539,
"reward_std": 0.01133120059967041,
"rewards/accuracy_reward_step": 0.01171875,
"rewards/final_brier_reward_step": 0.00742187537252903,
"rewards/format_reward_step": 0.0078125,
"rewards/step_l1_reward": -0.003315756330266595,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.061138040876830925,
"train_runtime": 9391.5886,
"train_samples_per_second": 5.452,
"train_steps_per_second": 0.021
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 37838200,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}