Files
PureRL-1.5B-v7-s2-l2-kl-w0-b2/trainer_state.json
ModelHub XC 9da0124caf 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-kl-w0-b2
Source: Original Platform
2026-06-04 15:57:29 +08:00

12243 lines
503 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.47714588046073914,
"adv/mean_abs_step_conf": 0.7490277290344238,
"adv/ratio_final_to_reasoning": 1.622059942565935,
"adv/ratio_step_to_reasoning": 1.5698086470140988,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.7393431663513184,
"adv/std_step_conf": 0.9343300461769104,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04301927983760834,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0135,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 0.8933746814727783,
"reward_std": 0.19672557711601257,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420004606246948,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.5104547739028931,
"adv/mean_abs_step_conf": 0.7698483467102051,
"adv/ratio_final_to_reasoning": 1.503115479781084,
"adv/ratio_step_to_reasoning": 1.5081617139634353,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.7575037479400635,
"adv/std_step_conf": 0.9345317482948303,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.04039499908685684,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0158,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 0.8337589502334595,
"reward_std": 0.1928534209728241,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291916012763977,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.7824219465255737,
"adv/mean_abs_reasoning": 0.49416670203208923,
"adv/mean_abs_step_conf": 0.7591285705566406,
"adv/ratio_final_to_reasoning": 1.583315798713541,
"adv/ratio_step_to_reasoning": 1.5361791222172347,
"adv/std_final_conf": 0.931019127368927,
"adv/std_reasoning": 0.7392831444740295,
"adv/std_step_conf": 0.9340925216674805,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.41486037234042555,
"calib/avg_num_step_conf": 4.84375,
"calib/ece": 0.25031496062992126,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.2992125984251969,
"calib/gap": -0.012628989361702203,
"calib/mean_conf": 0.8802362204724409,
"calib/mu_c": 0.8755625,
"calib/mu_w": 0.8881914893617022,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25031496062992126,
"calib/std_conf": 0.04860194362675066,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.8069333333333334,
"calib/step_q_c_n": 675.0,
"calib/step_q_gap": 0.04603067846607678,
"calib/step_q_w": 0.7609026548672566,
"calib/step_q_w_n": 565.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2572.0,
"completions/max_terminated_length": 2572.0,
"completions/mean_length": 499.26171875,
"completions/mean_terminated_length": 501.2196350097656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.04080616310238838,
"kl": 0.0015122145414352417,
"learning_rate": 7.5e-07,
"loss": 0.0555,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.032926708459854126,
"mask/share_reasoning": 0.8567208051681519,
"mask/share_step_conf": 0.10644622147083282,
"num_tokens": 691728.0,
"reward": 0.878947377204895,
"reward_std": 0.1959269642829895,
"rewards/accuracy_reward_step": 0.625,
"rewards/final_brier_reward_step": 0.6897921562194824,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7454462647438049,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.760870099067688,
"adv/mean_abs_reasoning": 0.5114138722419739,
"adv/mean_abs_step_conf": 0.7488071918487549,
"adv/ratio_final_to_reasoning": 1.487777591429285,
"adv/ratio_step_to_reasoning": 1.4641902234017992,
"adv/std_final_conf": 0.9306450486183167,
"adv/std_reasoning": 0.7575408220291138,
"adv/std_step_conf": 0.9347666501998901,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4202674897119342,
"calib/avg_num_step_conf": 4.8125,
"calib/ece": 0.23678571428571432,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.24603174603174602,
"calib/gap": -0.010506172839506278,
"calib/mean_conf": 0.8775793650793651,
"calib/mu_c": 0.8738271604938271,
"calib/mu_w": 0.8843333333333334,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.23575396825396827,
"calib/std_conf": 0.052486710816122675,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7923658536585366,
"calib/step_q_c_n": 820.0,
"calib/step_q_gap": 0.01923478569737147,
"calib/step_q_w": 0.7731310679611652,
"calib/step_q_w_n": 412.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2321.0,
"completions/max_terminated_length": 2321.0,
"completions/mean_length": 503.68359375,
"completions/mean_terminated_length": 503.68359375,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.0455099456012249,
"kl": 0.0005451589822769165,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.027,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0339834988117218,
"mask/share_reasoning": 0.8529292345046997,
"mask/share_step_conf": 0.1130872368812561,
"num_tokens": 926839.0,
"reward": 0.8701273202896118,
"reward_std": 0.20101894438266754,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6966761350631714,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7201408743858337,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7657008767127991,
"adv/mean_abs_reasoning": 0.39336252212524414,
"adv/mean_abs_step_conf": 0.7815566658973694,
"adv/ratio_final_to_reasoning": 1.946552692859247,
"adv/ratio_step_to_reasoning": 1.9868610300615437,
"adv/std_final_conf": 0.9317295551300049,
"adv/std_reasoning": 0.6815266609191895,
"adv/std_step_conf": 0.9331948757171631,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4420645161290322,
"calib/avg_num_step_conf": 4.71875,
"calib/ece": 0.3807630522088353,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.27309236947791166,
"calib/gap": -0.011701290322580715,
"calib/mean_conf": 0.8732128514056225,
"calib/mu_c": 0.8673387096774194,
"calib/mu_w": 0.8790400000000002,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.37799196787148587,
"calib/std_conf": 0.05030947274314032,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7987844408427877,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.011102545749725135,
"calib/step_q_w": 0.7876818950930625,
"calib/step_q_w_n": 591.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2331.0,
"completions/max_terminated_length": 2331.0,
"completions/mean_length": 496.82421875,
"completions/mean_terminated_length": 498.7725830078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.042361173778772354,
"kl": 0.0002884864807128906,
"learning_rate": 1.25e-06,
"loss": 0.0051,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.034748584032058716,
"mask/share_reasoning": 0.8492770791053772,
"mask/share_step_conf": 0.1120680719614029,
"num_tokens": 1160714.0,
"reward": 0.7748649716377258,
"reward_std": 0.16152727603912354,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.5831875205039978,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.675917387008667,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7618625164031982,
"adv/mean_abs_reasoning": 0.2755002975463867,
"adv/mean_abs_step_conf": 0.7575180530548096,
"adv/ratio_final_to_reasoning": 2.765378198094038,
"adv/ratio_step_to_reasoning": 2.749608838180163,
"adv/std_final_conf": 0.9300011992454529,
"adv/std_reasoning": 0.5726578831672668,
"adv/std_step_conf": 0.9342772364616394,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5214813870474249,
"calib/avg_num_step_conf": 4.83984375,
"calib/ece": 0.29996062992125977,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3228346456692913,
"calib/gap": 0.004041305456399913,
"calib/mean_conf": 0.8826377952755905,
"calib/mu_c": 0.8843243243243243,
"calib/mu_w": 0.8802830188679244,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29996062992125977,
"calib/std_conf": 0.04387404471739072,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7977023121387283,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.0004079794147795468,
"calib/step_q_w": 0.7972943327239488,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2624.0,
"completions/max_terminated_length": 2624.0,
"completions/mean_length": 447.80859375,
"completions/mean_terminated_length": 447.80859375,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.0064,
"grad_norm": 0.039196934551000595,
"kl": 0.0003452599048614502,
"learning_rate": 1.5e-06,
"loss": 0.0626,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.037614528089761734,
"mask/share_reasoning": 0.8385595083236694,
"mask/share_step_conf": 0.12382596731185913,
"num_tokens": 1381305.0,
"reward": 0.8490424156188965,
"reward_std": 0.13823771476745605,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6616894602775574,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.722332775592804,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7597512006759644,
"adv/mean_abs_reasoning": 0.4481978416442871,
"adv/mean_abs_step_conf": 0.7643946409225464,
"adv/ratio_final_to_reasoning": 1.6951246304281449,
"adv/ratio_step_to_reasoning": 1.7054848772101168,
"adv/std_final_conf": 0.930804967880249,
"adv/std_reasoning": 0.7205727696418762,
"adv/std_step_conf": 0.9342158436775208,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.5157617044012556,
"calib/avg_num_step_conf": 5.6328125,
"calib/ece": 0.24736220472440945,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.29133858267716534,
"calib/gap": 0.004130768717024003,
"calib/mean_conf": 0.8812204724409448,
"calib/mu_c": 0.8827329192546585,
"calib/mu_w": 0.8786021505376345,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.24736220472440945,
"calib/std_conf": 0.046791616010890866,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7910278372591005,
"calib/step_q_c_n": 934.0,
"calib/step_q_gap": 0.00996484513311624,
"calib/step_q_w": 0.7810629921259843,
"calib/step_q_w_n": 508.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1572.0,
"completions/max_terminated_length": 1572.0,
"completions/mean_length": 547.94140625,
"completions/mean_terminated_length": 550.0902099609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.036365069448947906,
"kl": 0.0002646446228027344,
"learning_rate": 1.75e-06,
"loss": 0.0327,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.029931582510471344,
"mask/share_reasoning": 0.8557358980178833,
"mask/share_step_conf": 0.11042627692222595,
"num_tokens": 1629002.0,
"reward": 0.8819910287857056,
"reward_std": 0.18995745480060577,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6971203088760376,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7434240579605103,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7698913812637329,
"adv/mean_abs_reasoning": 0.47465720772743225,
"adv/mean_abs_step_conf": 0.7837836742401123,
"adv/ratio_final_to_reasoning": 1.6219945019898157,
"adv/ratio_step_to_reasoning": 1.6512625563882581,
"adv/std_final_conf": 0.93187016248703,
"adv/std_reasoning": 0.7392329573631287,
"adv/std_step_conf": 0.9343845248222351,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.49318181818181817,
"calib/avg_num_step_conf": 4.9453125,
"calib/ece": 0.3134920634920635,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.28174603174603174,
"calib/gap": 0.004327784891165032,
"calib/mean_conf": 0.876984126984127,
"calib/mu_c": 0.8788732394366198,
"calib/mu_w": 0.8745454545454547,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3134920634920635,
"calib/std_conf": 0.05139745286243065,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.798998482549317,
"calib/step_q_c_n": 659.0,
"calib/step_q_gap": 0.05534115141257889,
"calib/step_q_w": 0.7436573311367382,
"calib/step_q_w_n": 607.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2775.0,
"completions/max_terminated_length": 2775.0,
"completions/mean_length": 529.98046875,
"completions/mean_terminated_length": 529.98046875,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.05035858228802681,
"kl": 0.0004501938819885254,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0013,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03286789357662201,
"mask/share_reasoning": 0.8621331453323364,
"mask/share_step_conf": 0.10499894618988037,
"num_tokens": 1871189.0,
"reward": 0.8457349538803101,
"reward_std": 0.1846785992383957,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6442609429359436,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7401776909828186,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.7846425175666809,
"adv/mean_abs_reasoning": 0.5143224000930786,
"adv/mean_abs_step_conf": 0.7686464786529541,
"adv/ratio_final_to_reasoning": 1.5255849588209294,
"adv/ratio_step_to_reasoning": 1.4944837683792298,
"adv/std_final_conf": 0.932033896446228,
"adv/std_reasoning": 0.7576586604118347,
"adv/std_step_conf": 0.9345789551734924,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4397239665096808,
"calib/avg_num_step_conf": 4.84765625,
"calib/ece": 0.29884462151394425,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2549800796812749,
"calib/gap": -0.009276556776556588,
"calib/mean_conf": 0.878605577689243,
"calib/mu_c": 0.874761904761905,
"calib/mu_w": 0.8840384615384616,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2958964143426295,
"calib/std_conf": 0.0430144954809309,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7825727411944869,
"calib/step_q_c_n": 653.0,
"calib/step_q_gap": 0.054018319425779504,
"calib/step_q_w": 0.7285544217687074,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2660.0,
"completions/max_terminated_length": 2660.0,
"completions/mean_length": 510.5859375,
"completions/mean_terminated_length": 514.6063232421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0096,
"grad_norm": 0.04146807640790939,
"kl": 0.005201190710067749,
"learning_rate": 2.25e-06,
"loss": -0.0547,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03358618915081024,
"mask/share_reasoning": 0.8540961742401123,
"mask/share_step_conf": 0.10450513660907745,
"num_tokens": 2109435.0,
"reward": 0.8226215839385986,
"reward_std": 0.2236328274011612,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6483156085014343,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.6867712736129761,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.7660097479820251,
"adv/mean_abs_reasoning": 0.3731532096862793,
"adv/mean_abs_step_conf": 0.7574515342712402,
"adv/ratio_final_to_reasoning": 2.0528022487761306,
"adv/ratio_step_to_reasoning": 2.0298673965796827,
"adv/std_final_conf": 0.9304073452949524,
"adv/std_reasoning": 0.6611678600311279,
"adv/std_step_conf": 0.9340184330940247,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5357142857142858,
"calib/avg_num_step_conf": 5.01171875,
"calib/ece": 0.2694094488188976,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.27165354330708663,
"calib/gap": 0.013868831168830975,
"calib/mean_conf": 0.8757086614173228,
"calib/mu_c": 0.881168831168831,
"calib/mu_w": 0.8673000000000001,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2694094488188976,
"calib/std_conf": 0.06993310754770636,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7612042620689656,
"calib/step_q_c_n": 725.0,
"calib/step_q_gap": -0.01596419671239646,
"calib/step_q_w": 0.7771684587813621,
"calib/step_q_w_n": 558.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2128.0,
"completions/max_terminated_length": 2128.0,
"completions/mean_length": 506.41796875,
"completions/mean_terminated_length": 506.41796875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.04930815100669861,
"kl": 0.0004813075065612793,
"learning_rate": 2.5e-06,
"loss": 0.0962,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03370947390794754,
"mask/share_reasoning": 0.8564748764038086,
"mask/share_step_conf": 0.10981567949056625,
"num_tokens": 2345878.0,
"reward": 0.8647788763046265,
"reward_std": 0.16030901670455933,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6840370893478394,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7275517582893372,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7875567674636841,
"adv/mean_abs_reasoning": 0.405804842710495,
"adv/mean_abs_step_conf": 0.7895094752311707,
"adv/ratio_final_to_reasoning": 1.9407278686063747,
"adv/ratio_step_to_reasoning": 1.9455398066661669,
"adv/std_final_conf": 0.928180456161499,
"adv/std_reasoning": 0.6816299557685852,
"adv/std_step_conf": 0.933238685131073,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.42639821029082775,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.305748031496063,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3110236220472441,
"calib/gap": -0.021482901885586325,
"calib/mean_conf": 0.8778740157480315,
"calib/mu_c": 0.8689932885906041,
"calib/mu_w": 0.8904761904761904,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.29850393700787403,
"calib/std_conf": 0.07778559186119627,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7642151238591917,
"calib/step_q_c_n": 767.0,
"calib/step_q_gap": -0.01411560937013745,
"calib/step_q_w": 0.7783307332293291,
"calib/step_q_w_n": 641.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1749.0,
"completions/max_terminated_length": 1749.0,
"completions/mean_length": 528.2890625,
"completions/mean_terminated_length": 530.36083984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.03836844116449356,
"kl": 0.0006773471832275391,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.002,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032403960824012756,
"mask/share_reasoning": 0.8480488657951355,
"mask/share_step_conf": 0.11564093828201294,
"num_tokens": 2585600.0,
"reward": 0.8380727171897888,
"reward_std": 0.1705556958913803,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6433242559432983,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7195398807525635,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7522432804107666,
"adv/mean_abs_reasoning": 0.495732843875885,
"adv/mean_abs_step_conf": 0.7614065408706665,
"adv/ratio_final_to_reasoning": 1.5174368406364926,
"adv/ratio_step_to_reasoning": 1.5359211121006486,
"adv/std_final_conf": 0.9305465817451477,
"adv/std_reasoning": 0.7576424479484558,
"adv/std_step_conf": 0.9341990351676941,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5511680869820746,
"calib/avg_num_step_conf": 5.5390625,
"calib/ece": 0.203266129032258,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.2903225806451613,
"calib/gap": 0.015575962386129905,
"calib/mean_conf": 0.8726209677419355,
"calib/mu_c": 0.8777710843373494,
"calib/mu_w": 0.8621951219512195,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.203266129032258,
"calib/std_conf": 0.059194743219900266,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7626148409893994,
"calib/step_q_c_n": 849.0,
"calib/step_q_gap": 0.015245772448098771,
"calib/step_q_w": 0.7473690685413006,
"calib/step_q_w_n": 569.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2769.0,
"completions/max_terminated_length": 2769.0,
"completions/mean_length": 479.0,
"completions/mean_terminated_length": 482.7716369628906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.0128,
"grad_norm": 0.03968612849712372,
"kl": 0.001405954360961914,
"learning_rate": 3e-06,
"loss": 0.0169,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.036165084689855576,
"mask/share_reasoning": 0.828498125076294,
"mask/share_step_conf": 0.12752431631088257,
"num_tokens": 2812400.0,
"reward": 0.9073599576950073,
"reward_std": 0.20147816836833954,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7176058888435364,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7728952169418335,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7512305974960327,
"adv/mean_abs_reasoning": 0.39446693658828735,
"adv/mean_abs_step_conf": 0.7417395114898682,
"adv/ratio_final_to_reasoning": 1.9044196808821683,
"adv/ratio_step_to_reasoning": 1.8803591446855172,
"adv/std_final_conf": 0.9301111698150635,
"adv/std_reasoning": 0.6814743280410767,
"adv/std_step_conf": 0.9348357319831848,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5096972095784682,
"calib/avg_num_step_conf": 4.69921875,
"calib/ece": 0.24792968750000005,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.36328125,
"calib/gap": 0.0034217296655455476,
"calib/mean_conf": 0.8812109375,
"calib/mu_c": 0.8824539877300616,
"calib/mu_w": 0.879032258064516,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.24621093750000006,
"calib/std_conf": 0.05488356316212618,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7557579787234041,
"calib/step_q_c_n": 752.0,
"calib/step_q_gap": 0.009505207104778979,
"calib/step_q_w": 0.7462527716186251,
"calib/step_q_w_n": 451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1367.0,
"completions/max_terminated_length": 1367.0,
"completions/mean_length": 460.07421875,
"completions/mean_terminated_length": 461.8784484863281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.038122136145830154,
"kl": 0.0019731521606445312,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0146,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03544770926237106,
"mask/share_reasoning": 0.8494640588760376,
"mask/share_step_conf": 0.11118200421333313,
"num_tokens": 3034771.0,
"reward": 0.8951612710952759,
"reward_std": 0.17123009264469147,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.6993730068206787,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7659494876861572,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7730479836463928,
"adv/mean_abs_reasoning": 0.5005540251731873,
"adv/mean_abs_step_conf": 0.7642529606819153,
"adv/ratio_final_to_reasoning": 1.544384711278518,
"adv/ratio_step_to_reasoning": 1.5268141344333221,
"adv/std_final_conf": 0.9311867356300354,
"adv/std_reasoning": 0.7394110560417175,
"adv/std_step_conf": 0.9345505237579346,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.47611822140124027,
"calib/avg_num_step_conf": 5.5078125,
"calib/ece": 0.32891566265060246,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4979919678714859,
"calib/gap": -0.006167040506662724,
"calib/mean_conf": 0.8975903614457832,
"calib/mu_c": 0.8949650349650352,
"calib/mu_w": 0.9011320754716979,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.32610441767068277,
"calib/std_conf": 0.04852665465823042,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.720193298969072,
"calib/step_q_c_n": 776.0,
"calib/step_q_gap": 0.028521374678851186,
"calib/step_q_w": 0.6916719242902208,
"calib/step_q_w_n": 634.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2471.0,
"completions/max_terminated_length": 2471.0,
"completions/mean_length": 526.9140625,
"completions/mean_terminated_length": 533.1620483398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.043646715581417084,
"kl": 0.0052623748779296875,
"learning_rate": 3.5e-06,
"loss": -0.0802,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03268653526902199,
"mask/share_reasoning": 0.8374663591384888,
"mask/share_step_conf": 0.11812833696603775,
"num_tokens": 3275061.0,
"reward": 0.8433182835578918,
"reward_std": 0.1970943808555603,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6279773712158203,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7524092197418213,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7509260773658752,
"adv/mean_abs_reasoning": 0.43354499340057373,
"adv/mean_abs_step_conf": 0.7791382670402527,
"adv/ratio_final_to_reasoning": 1.7320603139154633,
"adv/ratio_step_to_reasoning": 1.7971335822124652,
"adv/std_final_conf": 0.9233602285385132,
"adv/std_reasoning": 0.7012984156608582,
"adv/std_step_conf": 0.9344486594200134,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5131959281136107,
"calib/avg_num_step_conf": 4.91796875,
"calib/ece": 0.33823529411764713,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.596078431372549,
"calib/gap": 0.009061203971346021,
"calib/mean_conf": 0.9107843137254903,
"calib/mu_c": 0.9146575342465755,
"calib/mu_w": 0.9055963302752295,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.33823529411764713,
"calib/std_conf": 0.04947739183668031,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6933720930232558,
"calib/step_q_c_n": 688.0,
"calib/step_q_gap": 0.016401865352502765,
"calib/step_q_w": 0.676970227670753,
"calib/step_q_w_n": 571.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1123.0,
"completions/max_terminated_length": 1123.0,
"completions/mean_length": 454.046875,
"completions/mean_terminated_length": 455.8274841308594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.016,
"grad_norm": 0.032701823860406876,
"kl": 0.0075130462646484375,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0092,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.035241350531578064,
"mask/share_reasoning": 0.8442916870117188,
"mask/share_step_conf": 0.1165606826543808,
"num_tokens": 3499177.0,
"reward": 0.8645692467689514,
"reward_std": 0.1731414645910263,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6403363347053528,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7747396230697632,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7602238655090332,
"adv/mean_abs_reasoning": 0.44012323021888733,
"adv/mean_abs_step_conf": 0.7481175661087036,
"adv/ratio_final_to_reasoning": 1.727297750520802,
"adv/ratio_step_to_reasoning": 1.6997911374426677,
"adv/std_final_conf": 0.9273681044578552,
"adv/std_reasoning": 0.7205715179443359,
"adv/std_step_conf": 0.9346485137939453,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5498666666666667,
"calib/avg_num_step_conf": 5.97265625,
"calib/ece": 0.30691999999999997,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.676,
"calib/gap": 0.026200000000000112,
"calib/mean_conf": 0.9069200000000001,
"calib/mu_c": 0.9174000000000001,
"calib/mu_w": 0.8912,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30691999999999997,
"calib/std_conf": 0.08308858886754547,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6207349272349273,
"calib/step_q_c_n": 962.0,
"calib/step_q_gap": -0.01829505512838847,
"calib/step_q_w": 0.6390299823633158,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2575.0,
"completions/max_terminated_length": 2575.0,
"completions/mean_length": 628.4921875,
"completions/mean_terminated_length": 633.44091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.049372486770153046,
"kl": 0.010541915893554688,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0158,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.026201680302619934,
"mask/share_reasoning": 0.8591794967651367,
"mask/share_step_conf": 0.10680627077817917,
"num_tokens": 3768919.0,
"reward": 0.8785654306411743,
"reward_std": 0.1860627830028534,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6557347774505615,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7888960838317871,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.763668417930603,
"adv/mean_abs_reasoning": 0.4707931876182556,
"adv/mean_abs_step_conf": 0.7634800672531128,
"adv/ratio_final_to_reasoning": 1.6220889299482097,
"adv/ratio_step_to_reasoning": 1.6216888589989187,
"adv/std_final_conf": 0.9241020679473877,
"adv/std_reasoning": 0.7206013798713684,
"adv/std_step_conf": 0.9347437024116516,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5685767790262172,
"calib/avg_num_step_conf": 5.65234375,
"calib/ece": 0.22802371541501973,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7747035573122529,
"calib/gap": 0.01153707865168574,
"calib/mean_conf": 0.9269169960474308,
"calib/mu_c": 0.9303370786516855,
"calib/mu_w": 0.9187999999999997,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.22569169960474306,
"calib/std_conf": 0.04471231081930742,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6131984585741811,
"calib/step_q_c_n": 1038.0,
"calib/step_q_gap": 0.01755053681378993,
"calib/step_q_w": 0.5956479217603912,
"calib/step_q_w_n": 409.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3069.0,
"completions/max_terminated_length": 3069.0,
"completions/mean_length": 534.51171875,
"completions/mean_terminated_length": 534.51171875,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.018133333333333335,
"grad_norm": 3.9036219120025635,
"kl": 13.952611923217773,
"learning_rate": 4.25e-06,
"loss": 0.3213,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03322366625070572,
"mask/share_reasoning": 0.8428764343261719,
"mask/share_step_conf": 0.12389989197254181,
"num_tokens": 4009282.0,
"reward": 0.9437460899353027,
"reward_std": 0.18401694297790527,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7354055047035217,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8161492347717285,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7250304222106934,
"adv/mean_abs_reasoning": 0.4055905044078827,
"adv/mean_abs_step_conf": 0.7682268023490906,
"adv/ratio_final_to_reasoning": 1.7875922003380174,
"adv/ratio_step_to_reasoning": 1.8940946447220621,
"adv/std_final_conf": 0.9232602715492249,
"adv/std_reasoning": 0.7013160586357117,
"adv/std_step_conf": 0.9348356127738953,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.4806957186544342,
"calib/avg_num_step_conf": 4.93359375,
"calib/ece": 0.3623715415019763,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.83399209486166,
"calib/gap": -0.001804918450560411,
"calib/mean_conf": 0.9315415019762846,
"calib/mu_c": 0.9307638888888891,
"calib/mu_w": 0.9325688073394495,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3623715415019763,
"calib/std_conf": 0.04609231217906996,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5888536953242836,
"calib/step_q_c_n": 663.0,
"calib/step_q_gap": -0.011679638009049809,
"calib/step_q_w": 0.6005333333333334,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1899.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 495.640625,
"completions/mean_terminated_length": 497.5843505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.0192,
"grad_norm": 0.03445158153772354,
"kl": 0.014752388000488281,
"learning_rate": 4.5e-06,
"loss": -0.062,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0338444709777832,
"mask/share_reasoning": 0.8541897535324097,
"mask/share_step_conf": 0.10805948078632355,
"num_tokens": 4246886.0,
"reward": 0.843170166015625,
"reward_std": 0.18039628863334656,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6090675592422485,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7686790227890015,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7084481716156006,
"adv/mean_abs_reasoning": 0.34135955572128296,
"adv/mean_abs_step_conf": 0.7834237217903137,
"adv/ratio_final_to_reasoning": 2.07537231561797,
"adv/ratio_step_to_reasoning": 2.2950103744275205,
"adv/std_final_conf": 0.9185612201690674,
"adv/std_reasoning": 0.6610927581787109,
"adv/std_step_conf": 0.934778094291687,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.43786184210526313,
"calib/avg_num_step_conf": 4.6015625,
"calib/ece": 0.3341666666666669,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8650793650793651,
"calib/gap": 0.006036842105263451,
"calib/mean_conf": 0.9373412698412698,
"calib/mu_c": 0.9397368421052633,
"calib/mu_w": 0.9336999999999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3341666666666669,
"calib/std_conf": 0.07056795247028086,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.6050581395348836,
"calib/step_q_c_n": 688.0,
"calib/step_q_gap": 0.040119364024679416,
"calib/step_q_w": 0.5649387755102042,
"calib/step_q_w_n": 490.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2346.0,
"completions/max_terminated_length": 2346.0,
"completions/mean_length": 490.83203125,
"completions/mean_terminated_length": 496.6521911621094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.03231712058186531,
"kl": 0.018993377685546875,
"learning_rate": 4.75e-06,
"loss": -0.0115,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032016873359680176,
"mask/share_reasoning": 0.8505151867866516,
"mask/share_step_conf": 0.10574917495250702,
"num_tokens": 4477299.0,
"reward": 0.8782888650894165,
"reward_std": 0.1701107621192932,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6328773498535156,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8088566064834595,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.7251197099685669,
"adv/mean_abs_reasoning": 0.4031530022621155,
"adv/mean_abs_step_conf": 0.7815650105476379,
"adv/ratio_final_to_reasoning": 1.7986216297531634,
"adv/ratio_step_to_reasoning": 1.9386312545416509,
"adv/std_final_conf": 0.9166728258132935,
"adv/std_reasoning": 0.7012306451797485,
"adv/std_step_conf": 0.934552013874054,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.4580340386979874,
"calib/avg_num_step_conf": 5.3984375,
"calib/ece": 0.3506299212598426,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9409448818897638,
"calib/gap": -0.0028796997346796083,
"calib/mean_conf": 0.9486614173228347,
"calib/mu_c": 0.9475163398692809,
"calib/mu_w": 0.9503960396039605,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3484645669291339,
"calib/std_conf": 0.0423774231365238,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5716861219195849,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.024730311772285463,
"calib/step_q_w": 0.5469558101472994,
"calib/step_q_w_n": 611.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2155.0,
"completions/max_terminated_length": 2155.0,
"completions/mean_length": 460.74609375,
"completions/mean_terminated_length": 462.552978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.03185487538576126,
"kl": 0.024953842163085938,
"learning_rate": 5e-06,
"loss": 0.0105,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.036644428968429565,
"mask/share_reasoning": 0.8307029008865356,
"mask/share_step_conf": 0.1287464201450348,
"num_tokens": 4700122.0,
"reward": 0.8812888860702515,
"reward_std": 0.16984190046787262,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6324000358581543,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.812208890914917,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.737258791923523,
"adv/mean_abs_reasoning": 0.4711818993091583,
"adv/mean_abs_step_conf": 0.759905993938446,
"adv/ratio_final_to_reasoning": 1.5647010061389957,
"adv/ratio_step_to_reasoning": 1.6127656751089372,
"adv/std_final_conf": 0.9146026968955994,
"adv/std_reasoning": 0.7392024993896484,
"adv/std_step_conf": 0.9352498054504395,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5120763901891031,
"calib/avg_num_step_conf": 5.1953125,
"calib/ece": 0.38578124999999985,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9609375,
"calib/gap": -0.0014004868002245319,
"calib/mean_conf": 0.95671875,
"calib/mu_c": 0.9561224489795919,
"calib/mu_w": 0.9575229357798164,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.38414062499999985,
"calib/std_conf": 0.033298117791213055,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5543406593406593,
"calib/step_q_c_n": 728.0,
"calib/step_q_gap": 0.008941988244313914,
"calib/step_q_w": 0.5453986710963454,
"calib/step_q_w_n": 602.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1295.0,
"completions/max_terminated_length": 1295.0,
"completions/mean_length": 492.125,
"completions/mean_terminated_length": 494.054931640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.0224,
"grad_norm": 0.025124864652752876,
"kl": 0.026861190795898438,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0157,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03341342881321907,
"mask/share_reasoning": 0.8461604118347168,
"mask/share_step_conf": 0.11651992797851562,
"num_tokens": 4929066.0,
"reward": 0.8638095855712891,
"reward_std": 0.19264093041419983,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6035058498382568,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8100508451461792,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.6960275173187256,
"adv/mean_abs_reasoning": 0.3700941503047943,
"adv/mean_abs_step_conf": 0.7384877800941467,
"adv/ratio_final_to_reasoning": 1.8806768946375023,
"adv/ratio_step_to_reasoning": 1.9954051678092144,
"adv/std_final_conf": 0.9030138254165649,
"adv/std_reasoning": 0.6815323829650879,
"adv/std_step_conf": 0.9350609183311462,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.45413126884077837,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.31256916996047424,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": -0.002593861331871783,
"calib/mean_conf": 0.960790513833992,
"calib/mu_c": 0.9598780487804879,
"calib/mu_w": 0.9624719101123597,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.31256916996047424,
"calib/std_conf": 0.020200760878050606,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5592044198895028,
"calib/step_q_c_n": 905.0,
"calib/step_q_gap": 0.029487915035133927,
"calib/step_q_w": 0.5297165048543688,
"calib/step_q_w_n": 515.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2621.0,
"completions/max_terminated_length": 2621.0,
"completions/mean_length": 482.3046875,
"completions/mean_terminated_length": 482.3046875,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.04782715439796448,
"kl": 0.041934967041015625,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0586,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0337035208940506,
"mask/share_reasoning": 0.8390634059906006,
"mask/share_step_conf": 0.12723305821418762,
"num_tokens": 5154352.0,
"reward": 0.9010443091392517,
"reward_std": 0.1715245544910431,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.6647961139678955,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8115112781524658,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7233649492263794,
"adv/mean_abs_reasoning": 0.4360201060771942,
"adv/mean_abs_step_conf": 0.7778668403625488,
"adv/ratio_final_to_reasoning": 1.6590174148948829,
"adv/ratio_step_to_reasoning": 1.7840159880719655,
"adv/std_final_conf": 0.9036288261413574,
"adv/std_reasoning": 0.7204331755638123,
"adv/std_step_conf": 0.9352081418037415,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.4829948617567898,
"calib/avg_num_step_conf": 5.375,
"calib/ece": 0.4408203125,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.984375,
"calib/gap": -0.001261316368974863,
"calib/mean_conf": 0.9642578125,
"calib/mu_c": 0.9636567164179104,
"calib/mu_w": 0.9649180327868853,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4408203125,
"calib/std_conf": 0.020940669944269776,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5581728045325779,
"calib/step_q_c_n": 706.0,
"calib/step_q_gap": 0.019948923935563023,
"calib/step_q_w": 0.5382238805970149,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1706.0,
"completions/max_terminated_length": 1706.0,
"completions/mean_length": 508.5703125,
"completions/mean_terminated_length": 510.5647277832031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.025247525423765182,
"kl": 0.031337738037109375,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0069,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03410639613866806,
"mask/share_reasoning": 0.8367390632629395,
"mask/share_step_conf": 0.1252482831478119,
"num_tokens": 5388482.0,
"reward": 0.824210524559021,
"reward_std": 0.1818922758102417,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.5551589727401733,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.7885745763778687,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7442210912704468,
"adv/mean_abs_reasoning": 0.625908374786377,
"adv/mean_abs_step_conf": 0.7788434028625488,
"adv/ratio_final_to_reasoning": 1.189025616607942,
"adv/ratio_step_to_reasoning": 1.2443409199123894,
"adv/std_final_conf": 0.9220814108848572,
"adv/std_reasoning": 0.8429231643676758,
"adv/std_step_conf": 0.9353823661804199,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.5473770491803278,
"calib/avg_num_step_conf": 6.10546875,
"calib/ece": 0.4607692307692309,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9838056680161943,
"calib/gap": 0.0032131147540985783,
"calib/mean_conf": 0.9644129554655871,
"calib/mu_c": 0.9660000000000002,
"calib/mu_w": 0.9627868852459016,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.45955465587044547,
"calib/std_conf": 0.026133407237792707,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5604884004884005,
"calib/step_q_c_n": 819.0,
"calib/step_q_gap": 0.03531366930560487,
"calib/step_q_w": 0.5251747311827957,
"calib/step_q_w_n": 744.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2993.0,
"completions/max_terminated_length": 2993.0,
"completions/mean_length": 593.07421875,
"completions/mean_terminated_length": 593.07421875,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0256,
"grad_norm": 0.03646933659911156,
"kl": 0.027853012084960938,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0254,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03060779720544815,
"mask/share_reasoning": 0.8466688394546509,
"mask/share_step_conf": 0.12272335588932037,
"num_tokens": 5644821.0,
"reward": 0.7971993088722229,
"reward_std": 0.24867865443229675,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5218691229820251,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7819044589996338,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.7325584888458252,
"adv/mean_abs_reasoning": 0.4987294375896454,
"adv/mean_abs_step_conf": 0.749326229095459,
"adv/ratio_final_to_reasoning": 1.468849507633384,
"adv/ratio_step_to_reasoning": 1.5024704230753763,
"adv/std_final_conf": 0.9082023501396179,
"adv/std_reasoning": 0.7576895356178284,
"adv/std_step_conf": 0.9353220462799072,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6032865389661507,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.37721115537848604,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9880478087649402,
"calib/gap": 0.007340593020204844,
"calib/mean_conf": 0.9668525896414343,
"calib/mu_c": 0.9698648648648648,
"calib/mu_w": 0.9625242718446599,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.37721115537848604,
"calib/std_conf": 0.01931223275365144,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.548428927680798,
"calib/step_q_c_n": 802.0,
"calib/step_q_gap": 0.014344420638544464,
"calib/step_q_w": 0.5340845070422535,
"calib/step_q_w_n": 639.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2224.0,
"completions/max_terminated_length": 2224.0,
"completions/mean_length": 487.37109375,
"completions/mean_terminated_length": 487.37109375,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.02042507752776146,
"kl": 0.034358978271484375,
"learning_rate": 4.861111111111111e-06,
"loss": 0.0507,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03349972888827324,
"mask/share_reasoning": 0.8386950492858887,
"mask/share_step_conf": 0.12780524790287018,
"num_tokens": 5872812.0,
"reward": 0.8485996723175049,
"reward_std": 0.22764171659946442,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6068382859230042,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7786422371864319,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.7012295126914978,
"adv/mean_abs_reasoning": 0.4339819550514221,
"adv/mean_abs_step_conf": 0.752228856086731,
"adv/ratio_final_to_reasoning": 1.6158033865910617,
"adv/ratio_step_to_reasoning": 1.7333182804745881,
"adv/std_final_conf": 0.8902946710586548,
"adv/std_reasoning": 0.7205467820167542,
"adv/std_step_conf": 0.935208797454834,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5849214077062178,
"calib/avg_num_step_conf": 5.34375,
"calib/ece": 0.3228915662650604,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9718875502008032,
"calib/gap": 0.018116566977326443,
"calib/mean_conf": 0.957429718875502,
"calib/mu_c": 0.9640506329113924,
"calib/mu_w": 0.9459340659340659,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3228915662650604,
"calib/std_conf": 0.07118489472683348,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5745430463576159,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.057430485183064484,
"calib/step_q_w": 0.5171125611745514,
"calib/step_q_w_n": 613.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2566.0,
"completions/max_terminated_length": 2566.0,
"completions/mean_length": 538.953125,
"completions/mean_terminated_length": 538.953125,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.02381037175655365,
"kl": 0.03139495849609375,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0775,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03021368384361267,
"mask/share_reasoning": 0.8615808486938477,
"mask/share_step_conf": 0.10820543766021729,
"num_tokens": 6116024.0,
"reward": 0.8803870677947998,
"reward_std": 0.1974932700395584,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6489335894584656,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7938718199729919,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7713116407394409,
"adv/mean_abs_reasoning": 0.47726085782051086,
"adv/mean_abs_step_conf": 0.7782514691352844,
"adv/ratio_final_to_reasoning": 1.616121724839872,
"adv/ratio_step_to_reasoning": 1.6306626792930308,
"adv/std_final_conf": 0.9125264883041382,
"adv/std_reasoning": 0.720649242401123,
"adv/std_step_conf": 0.9354422092437744,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5927287581699346,
"calib/avg_num_step_conf": 5.84375,
"calib/ece": 0.4235968379446642,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9802371541501976,
"calib/gap": 0.011120537958773102,
"calib/mean_conf": 0.9574308300395258,
"calib/mu_c": 0.9625735294117647,
"calib/mu_w": 0.9514529914529916,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4217391304347828,
"calib/std_conf": 0.055589902953488125,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5399999999999999,
"calib/step_q_c_n": 757.0,
"calib/step_q_gap": 0.0006224627875507371,
"calib/step_q_w": 0.5393775372124492,
"calib/step_q_w_n": 739.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2235.0,
"completions/max_terminated_length": 2235.0,
"completions/mean_length": 512.63671875,
"completions/mean_terminated_length": 512.63671875,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.0288,
"grad_norm": 0.020446553826332092,
"kl": 0.03476715087890625,
"learning_rate": 4.805555555555556e-06,
"loss": 0.031,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032595522701740265,
"mask/share_reasoning": 0.8426357507705688,
"mask/share_step_conf": 0.12476875633001328,
"num_tokens": 6352475.0,
"reward": 0.8334004282951355,
"reward_std": 0.20353764295578003,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5707800388336182,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7921144962310791,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.7157641649246216,
"adv/mean_abs_reasoning": 0.3361932337284088,
"adv/mean_abs_step_conf": 0.7597331404685974,
"adv/ratio_final_to_reasoning": 2.129026087130731,
"adv/ratio_step_to_reasoning": 2.2598109189857825,
"adv/std_final_conf": 0.8849893808364868,
"adv/std_reasoning": 0.6185228228569031,
"adv/std_step_conf": 0.9350174069404602,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5315514735033698,
"calib/avg_num_step_conf": 5.234375,
"calib/ece": 0.32996078431372555,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9686274509803922,
"calib/gap": 0.011650588079820556,
"calib/mean_conf": 0.9575686274509804,
"calib/mu_c": 0.9618633540372672,
"calib/mu_w": 0.9502127659574466,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.32807843137254905,
"calib/std_conf": 0.06293764573812151,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.566830985915493,
"calib/step_q_c_n": 852.0,
"calib/step_q_gap": -0.009582948510736489,
"calib/step_q_w": 0.5764139344262295,
"calib/step_q_w_n": 488.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1450.0,
"completions/max_terminated_length": 1450.0,
"completions/mean_length": 526.67578125,
"completions/mean_terminated_length": 528.7412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.023936253041028976,
"kl": 0.031139373779296875,
"learning_rate": 4.777777777777778e-06,
"loss": -0.0249,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03148103132843971,
"mask/share_reasoning": 0.8523135781288147,
"mask/share_step_conf": 0.11229914426803589,
"num_tokens": 6594248.0,
"reward": 0.8912913799285889,
"reward_std": 0.15562722086906433,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.6597297191619873,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7978529930114746,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.74814772605896,
"adv/mean_abs_reasoning": 0.5354911088943481,
"adv/mean_abs_step_conf": 0.7605692148208618,
"adv/ratio_final_to_reasoning": 1.3971244594587073,
"adv/ratio_step_to_reasoning": 1.4203209020430652,
"adv/std_final_conf": 0.9092041850090027,
"adv/std_reasoning": 0.7754234671592712,
"adv/std_step_conf": 0.935039222240448,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.542939903234021,
"calib/avg_num_step_conf": 5.96484375,
"calib/ece": 0.439402390438247,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9800796812749004,
"calib/gap": 0.003364527629233449,
"calib/mean_conf": 0.9652988047808766,
"calib/mu_c": 0.9668939393939393,
"calib/mu_w": 0.9635294117647059,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.439402390438247,
"calib/std_conf": 0.021742113067573822,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5532034632034633,
"calib/step_q_c_n": 693.0,
"calib/step_q_gap": 0.05847924257996212,
"calib/step_q_w": 0.49472422062350113,
"calib/step_q_w_n": 834.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2380.0,
"completions/max_terminated_length": 2380.0,
"completions/mean_length": 574.78125,
"completions/mean_terminated_length": 574.78125,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.025617143139243126,
"kl": 0.033428192138671875,
"learning_rate": 4.75e-06,
"loss": 0.0279,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.029048418626189232,
"mask/share_reasoning": 0.854861855506897,
"mask/share_step_conf": 0.11608975380659103,
"num_tokens": 6848520.0,
"reward": 0.8259302377700806,
"reward_std": 0.22149044275283813,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5478870868682861,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8047546148300171,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7529537677764893,
"adv/mean_abs_reasoning": 0.5834614038467407,
"adv/mean_abs_step_conf": 0.7625214457511902,
"adv/ratio_final_to_reasoning": 1.2904945602439017,
"adv/ratio_step_to_reasoning": 1.3068926937136078,
"adv/std_final_conf": 0.9145447611808777,
"adv/std_reasoning": 0.7929871082305908,
"adv/std_step_conf": 0.9354909062385559,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.4917687074829932,
"calib/avg_num_step_conf": 6.015625,
"calib/ece": 0.35708502024291505,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9352226720647774,
"calib/gap": 0.011062585034013583,
"calib/mean_conf": 0.9519838056680162,
"calib/mu_c": 0.9564625850340136,
"calib/mu_w": 0.9454,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3569635627530365,
"calib/std_conf": 0.07052515372967161,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5104925544100801,
"calib/step_q_c_n": 873.0,
"calib/step_q_gap": 0.006054773300634864,
"calib/step_q_w": 0.5044377811094453,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2578.0,
"completions/max_terminated_length": 2578.0,
"completions/mean_length": 615.70703125,
"completions/mean_terminated_length": 618.12158203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.032,
"grad_norm": 0.019550230354070663,
"kl": 0.032825469970703125,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0398,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.028160445392131805,
"mask/share_reasoning": 0.854744553565979,
"mask/share_step_conf": 0.1131887435913086,
"num_tokens": 7113125.0,
"reward": 0.8469289541244507,
"reward_std": 0.23248617351055145,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6096968650817871,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7771298289299011,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7611095309257507,
"adv/mean_abs_reasoning": 0.4538407325744629,
"adv/mean_abs_step_conf": 0.7753937244415283,
"adv/ratio_final_to_reasoning": 1.6770410328933474,
"adv/ratio_step_to_reasoning": 1.7085150555857331,
"adv/std_final_conf": 0.9070990085601807,
"adv/std_reasoning": 0.7207038402557373,
"adv/std_step_conf": 0.9356343746185303,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.5348313117253264,
"calib/avg_num_step_conf": 6.21484375,
"calib/ece": 0.520217741935484,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.9516129032258065,
"calib/gap": 0.02418651819447204,
"calib/mean_conf": 0.9516693548387097,
"calib/mu_c": 0.9654205607476635,
"calib/mu_w": 0.9412340425531914,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.520217741935484,
"calib/std_conf": 0.10482598651084025,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5421069182389937,
"calib/step_q_c_n": 636.0,
"calib/step_q_gap": 0.036309710560110675,
"calib/step_q_w": 0.505797207678883,
"calib/step_q_w_n": 955.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2328.0,
"completions/max_terminated_length": 2328.0,
"completions/mean_length": 605.9765625,
"completions/mean_terminated_length": 608.3529663085938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.026389990001916885,
"kl": 0.029544830322265625,
"learning_rate": 4.694444444444445e-06,
"loss": 0.0218,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.02856561541557312,
"mask/share_reasoning": 0.8553394079208374,
"mask/share_step_conf": 0.11218871921300888,
"num_tokens": 7374167.0,
"reward": 0.7536863684654236,
"reward_std": 0.2233159840106964,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.4697951674461365,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7602337598800659,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7454090714454651,
"adv/mean_abs_reasoning": 0.44065701961517334,
"adv/mean_abs_step_conf": 0.75468909740448,
"adv/ratio_final_to_reasoning": 1.6915856057312608,
"adv/ratio_step_to_reasoning": 1.7126451271865621,
"adv/std_final_conf": 0.9135096669197083,
"adv/std_reasoning": 0.701328456401825,
"adv/std_step_conf": 0.9353702664375305,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5898970478789382,
"calib/avg_num_step_conf": 5.58984375,
"calib/ece": 0.40207843137254895,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9176470588235294,
"calib/gap": 0.035063259737038055,
"calib/mean_conf": 0.938078431372549,
"calib/mu_c": 0.9540287769784171,
"calib/mu_w": 0.9189655172413791,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.39752941176470585,
"calib/std_conf": 0.13279437785424603,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5582739726027397,
"calib/step_q_c_n": 730.0,
"calib/step_q_gap": 0.040970120962226186,
"calib/step_q_w": 0.5173038516405135,
"calib/step_q_w_n": 701.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1814.0,
"completions/max_terminated_length": 1814.0,
"completions/mean_length": 538.4921875,
"completions/mean_terminated_length": 538.4921875,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.03299793228507042,
"kl": 0.03919219970703125,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0164,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.029952574521303177,
"mask/share_reasoning": 0.8586723804473877,
"mask/share_step_conf": 0.11137507855892181,
"num_tokens": 7618725.0,
"reward": 0.8499400615692139,
"reward_std": 0.191944882273674,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5950214862823486,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7970460057258606,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7218849658966064,
"adv/mean_abs_reasoning": 0.4429064989089966,
"adv/mean_abs_step_conf": 0.7579190731048584,
"adv/ratio_final_to_reasoning": 1.6298811773473914,
"adv/ratio_step_to_reasoning": 1.7112394488945781,
"adv/std_final_conf": 0.8951165676116943,
"adv/std_reasoning": 0.7013714909553528,
"adv/std_step_conf": 0.9354180693626404,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5398657289002557,
"calib/avg_num_step_conf": 6.046875,
"calib/ece": 0.4180478087649402,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9482071713147411,
"calib/gap": 0.011603580562659865,
"calib/mean_conf": 0.9573306772908368,
"calib/mu_c": 0.9626470588235295,
"calib/mu_w": 0.9510434782608697,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4167729083665338,
"calib/std_conf": 0.05388165997824674,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5722263940520446,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.07556917407903507,
"calib/step_q_w": 0.4966572199730095,
"calib/step_q_w_n": 741.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2588.0,
"completions/max_terminated_length": 2588.0,
"completions/mean_length": 547.3359375,
"completions/mean_terminated_length": 549.4823608398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.0352,
"grad_norm": 0.020352911204099655,
"kl": 0.03536224365234375,
"learning_rate": 4.638888888888889e-06,
"loss": 0.0601,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030452530831098557,
"mask/share_reasoning": 0.8487410545349121,
"mask/share_step_conf": 0.11690014600753784,
"num_tokens": 7865715.0,
"reward": 0.8267344832420349,
"reward_std": 0.19006367027759552,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5706027746200562,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.780522346496582,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7223448157310486,
"adv/mean_abs_reasoning": 0.5534857511520386,
"adv/mean_abs_step_conf": 0.7550450563430786,
"adv/ratio_final_to_reasoning": 1.3050829478943997,
"adv/ratio_step_to_reasoning": 1.3641634943112984,
"adv/std_final_conf": 0.9112024903297424,
"adv/std_reasoning": 0.7928467392921448,
"adv/std_step_conf": 0.9351396560668945,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4595193340494093,
"calib/avg_num_step_conf": 6.12890625,
"calib/ece": 0.36218666666666666,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.896,
"calib/gap": -0.016228070175438702,
"calib/mean_conf": 0.9401333333333335,
"calib/mu_c": 0.9337719298245613,
"calib/mu_w": 0.95,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.34716,
"calib/std_conf": 0.11772578307235847,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.506442731277533,
"calib/step_q_c_n": 908.0,
"calib/step_q_gap": -0.001121565242890532,
"calib/step_q_w": 0.5075642965204236,
"calib/step_q_w_n": 661.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2178.0,
"completions/max_terminated_length": 2178.0,
"completions/mean_length": 510.0234375,
"completions/mean_terminated_length": 512.0235595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.019217276945710182,
"kl": 0.038822174072265625,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0494,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03184622898697853,
"mask/share_reasoning": 0.8305675387382507,
"mask/share_step_conf": 0.13367998600006104,
"num_tokens": 8101393.0,
"reward": 0.8644185066223145,
"reward_std": 0.21517950296401978,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6149966716766357,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7997777462005615,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7637768983840942,
"adv/mean_abs_reasoning": 0.4960789382457733,
"adv/mean_abs_step_conf": 0.7720634937286377,
"adv/ratio_final_to_reasoning": 1.539627747722873,
"adv/ratio_step_to_reasoning": 1.5563319347094169,
"adv/std_final_conf": 0.9017539024353027,
"adv/std_reasoning": 0.7394310832023621,
"adv/std_step_conf": 0.9350427985191345,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5493046776232617,
"calib/avg_num_step_conf": 5.20703125,
"calib/ece": 0.4012252964426877,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.8932806324110671,
"calib/gap": 0.020494943109987007,
"calib/mean_conf": 0.9387747035573122,
"calib/mu_c": 0.9479285714285712,
"calib/mu_w": 0.9274336283185842,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.39332015810276677,
"calib/std_conf": 0.11451427164975285,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5202301808066759,
"calib/step_q_c_n": 719.0,
"calib/step_q_gap": 0.057233438135666115,
"calib/step_q_w": 0.4629967426710098,
"calib/step_q_w_n": 614.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2079.0,
"completions/max_terminated_length": 2079.0,
"completions/mean_length": 565.484375,
"completions/mean_terminated_length": 565.484375,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.03530250862240791,
"kl": 0.03521728515625,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0291,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.028718896210193634,
"mask/share_reasoning": 0.871752142906189,
"mask/share_step_conf": 0.0995289534330368,
"num_tokens": 8355413.0,
"reward": 0.8562443256378174,
"reward_std": 0.22214269638061523,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.5939667820930481,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.812271773815155,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7034646272659302,
"adv/mean_abs_reasoning": 0.30368494987487793,
"adv/mean_abs_step_conf": 0.7625923156738281,
"adv/ratio_final_to_reasoning": 2.3164290082724435,
"adv/ratio_step_to_reasoning": 2.5111297612477204,
"adv/std_final_conf": 0.9043211340904236,
"adv/std_reasoning": 0.5960076451301575,
"adv/std_step_conf": 0.9348888397216797,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4618394513916902,
"calib/avg_num_step_conf": 5.8046875,
"calib/ece": 0.22792328042328053,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.9246031746031746,
"calib/gap": 0.008001882479494626,
"calib/mean_conf": 0.9431878306878307,
"calib/mu_c": 0.9453153153153154,
"calib/mu_w": 0.9373134328358208,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2184920634920636,
"calib/std_conf": 0.11297182458457421,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47746388384754984,
"calib/step_q_c_n": 1102.0,
"calib/step_q_gap": -0.0036298661524502007,
"calib/step_q_w": 0.48109375000000004,
"calib/step_q_w_n": 384.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2301.0,
"completions/max_terminated_length": 2301.0,
"completions/mean_length": 498.4765625,
"completions/mean_terminated_length": 500.431396484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.0384,
"grad_norm": 0.03462150692939758,
"kl": 0.042942047119140625,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0177,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03405049070715904,
"mask/share_reasoning": 0.829599142074585,
"mask/share_step_conf": 0.1324441134929657,
"num_tokens": 8585735.0,
"reward": 0.9496276378631592,
"reward_std": 0.14591683447360992,
"rewards/accuracy_reward_step": 0.72265625,
"rewards/final_brier_reward_step": 0.7397283315658569,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8181206583976746,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.7528542876243591,
"adv/mean_abs_reasoning": 0.4255194067955017,
"adv/mean_abs_step_conf": 0.7782937288284302,
"adv/ratio_final_to_reasoning": 1.7692595815874732,
"adv/ratio_step_to_reasoning": 1.8290440257228187,
"adv/std_final_conf": 0.9233216047286987,
"adv/std_reasoning": 0.6816492676734924,
"adv/std_step_conf": 0.9350104928016663,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5996240601503761,
"calib/avg_num_step_conf": 5.49609375,
"calib/ece": 0.4537349397590361,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8112449799196787,
"calib/gap": 0.043233082706766846,
"calib/mean_conf": 0.9069076305220883,
"calib/mu_c": 0.9299999999999999,
"calib/mu_w": 0.8867669172932331,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4473895582329317,
"calib/std_conf": 0.16586536945712516,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5145901639344262,
"calib/step_q_c_n": 549.0,
"calib/step_q_gap": 0.06427547861974092,
"calib/step_q_w": 0.4503146853146853,
"calib/step_q_w_n": 858.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2245.0,
"completions/max_terminated_length": 2245.0,
"completions/mean_length": 515.546875,
"completions/mean_terminated_length": 517.5686645507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.02837124653160572,
"kl": 0.040561676025390625,
"learning_rate": 4.527777777777778e-06,
"loss": 0.0259,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03237922489643097,
"mask/share_reasoning": 0.8477351069450378,
"mask/share_step_conf": 0.11597943305969238,
"num_tokens": 8824811.0,
"reward": 0.8100247383117676,
"reward_std": 0.19293108582496643,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.5355929732322693,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7993002533912659,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.7337380647659302,
"adv/mean_abs_reasoning": 0.3961666226387024,
"adv/mean_abs_step_conf": 0.7522833347320557,
"adv/ratio_final_to_reasoning": 1.85209460574645,
"adv/ratio_step_to_reasoning": 1.8989063988313977,
"adv/std_final_conf": 0.9243285655975342,
"adv/std_reasoning": 0.6816597580909729,
"adv/std_step_conf": 0.9352226257324219,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6171377641965877,
"calib/avg_num_step_conf": 5.65234375,
"calib/ece": 0.4039043824701196,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7689243027888446,
"calib/gap": 0.03243060860707914,
"calib/mean_conf": 0.9108366533864543,
"calib/mu_c": 0.9262121212121213,
"calib/mu_w": 0.8937815126050421,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3944223107569722,
"calib/std_conf": 0.1487092446801543,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.48603050397877984,
"calib/step_q_c_n": 754.0,
"calib/step_q_gap": 0.043750561698837565,
"calib/step_q_w": 0.4422799422799423,
"calib/step_q_w_n": 693.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2186.0,
"completions/max_terminated_length": 2186.0,
"completions/mean_length": 544.875,
"completions/mean_terminated_length": 544.875,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.022631574422121048,
"kl": 0.041919708251953125,
"learning_rate": 4.5e-06,
"loss": -0.0302,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030402235686779022,
"mask/share_reasoning": 0.8540716171264648,
"mask/share_step_conf": 0.11552612483501434,
"num_tokens": 9071187.0,
"reward": 0.8525607585906982,
"reward_std": 0.19134631752967834,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.5848976373672485,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.82100510597229,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7698835730552673,
"adv/mean_abs_reasoning": 0.41834837198257446,
"adv/mean_abs_step_conf": 0.7499821186065674,
"adv/ratio_final_to_reasoning": 1.840292982154441,
"adv/ratio_step_to_reasoning": 1.7927214944147232,
"adv/std_final_conf": 0.9209132790565491,
"adv/std_reasoning": 0.6816251873970032,
"adv/std_step_conf": 0.9349772930145264,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6363636363636364,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.4293307086614172,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7401574803149606,
"calib/gap": 0.05593641331346233,
"calib/mean_conf": 0.8942913385826772,
"calib/mu_c": 0.9233606557377048,
"calib/mu_w": 0.8674242424242424,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4216535433070865,
"calib/std_conf": 0.16674539081501089,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47681818181818186,
"calib/step_q_c_n": 660.0,
"calib/step_q_gap": 0.025802650157488893,
"calib/step_q_w": 0.45101553166069297,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 512.64453125,
"completions/mean_terminated_length": 512.64453125,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.0416,
"grad_norm": 0.027761587873101234,
"kl": 0.03902435302734375,
"learning_rate": 4.472222222222223e-06,
"loss": 0.0511,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03315240889787674,
"mask/share_reasoning": 0.8416691422462463,
"mask/share_step_conf": 0.12517844140529633,
"num_tokens": 9308512.0,
"reward": 0.8408872485160828,
"reward_std": 0.19443252682685852,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.569301187992096,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8202857375144958,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7725449800491333,
"adv/mean_abs_reasoning": 0.5383433103561401,
"adv/mean_abs_step_conf": 0.7697858810424805,
"adv/ratio_final_to_reasoning": 1.435041478528,
"adv/ratio_step_to_reasoning": 1.4299163122009073,
"adv/std_final_conf": 0.935536801815033,
"adv/std_reasoning": 0.7927297353744507,
"adv/std_step_conf": 0.9351649284362793,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5562437437437437,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.43980392156862746,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.6745098039215687,
"calib/gap": 0.04463213213213213,
"calib/mean_conf": 0.8680392156862745,
"calib/mu_c": 0.8932432432432431,
"calib/mu_w": 0.848611111111111,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4362745098039216,
"calib/std_conf": 0.19242130663324078,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4874868421052631,
"calib/step_q_c_n": 532.0,
"calib/step_q_gap": 0.04840693895756332,
"calib/step_q_w": 0.4390799031476998,
"calib/step_q_w_n": 826.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1679.0,
"completions/max_terminated_length": 1679.0,
"completions/mean_length": 529.17578125,
"completions/mean_terminated_length": 531.2510375976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.039750393480062485,
"kl": 0.042789459228515625,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0242,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.032181888818740845,
"mask/share_reasoning": 0.852412223815918,
"mask/share_step_conf": 0.11149965226650238,
"num_tokens": 9550741.0,
"reward": 0.8274902701377869,
"reward_std": 0.20201276242733002,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.5496792793273926,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8193637132644653,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.7592229843139648,
"adv/mean_abs_reasoning": 0.47157496213912964,
"adv/mean_abs_step_conf": 0.7730763554573059,
"adv/ratio_final_to_reasoning": 1.6099730589385488,
"adv/ratio_step_to_reasoning": 1.6393498754694777,
"adv/std_final_conf": 0.9329091310501099,
"adv/std_reasoning": 0.7392388582229614,
"adv/std_step_conf": 0.9349347949028015,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.562404999155548,
"calib/avg_num_step_conf": 5.12890625,
"calib/ece": 0.20612648221343866,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6521739130434783,
"calib/gap": 0.00318780611383207,
"calib/mean_conf": 0.8506324110671936,
"calib/mu_c": 0.851413612565445,
"calib/mu_w": 0.8482258064516129,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.15090909090909085,
"calib/std_conf": 0.21578149918890283,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4662576687116564,
"calib/step_q_c_n": 978.0,
"calib/step_q_gap": 0.00951140005494,
"calib/step_q_w": 0.4567462686567164,
"calib/step_q_w_n": 335.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1341.0,
"completions/max_terminated_length": 1341.0,
"completions/mean_length": 468.28125,
"completions/mean_terminated_length": 470.11767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.05282951146364212,
"kl": 0.04245758056640625,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0053,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.035104911774396896,
"mask/share_reasoning": 0.8374688625335693,
"mask/share_step_conf": 0.12351995706558228,
"num_tokens": 9777869.0,
"reward": 0.9565410614013672,
"reward_std": 0.19869878888130188,
"rewards/accuracy_reward_step": 0.75,
"rewards/final_brier_reward_step": 0.7464367151260376,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8205516338348389,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7502855658531189,
"adv/mean_abs_reasoning": 0.308381050825119,
"adv/mean_abs_step_conf": 0.7523162364959717,
"adv/ratio_final_to_reasoning": 2.4329820650316196,
"adv/ratio_step_to_reasoning": 2.439567004791762,
"adv/std_final_conf": 0.9343463778495789,
"adv/std_reasoning": 0.5959736108779907,
"adv/std_step_conf": 0.934657096862793,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.590472027972028,
"calib/avg_num_step_conf": 5.7421875,
"calib/ece": 0.32019607843137265,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.615686274509804,
"calib/gap": 0.07060939060939053,
"calib/mean_conf": 0.8238823529411764,
"calib/mu_c": 0.8548951048951049,
"calib/mu_w": 0.7842857142857144,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.29164705882352954,
"calib/std_conf": 0.24580764415264922,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4692866407263294,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.0379847809266155,
"calib/step_q_w": 0.4313018597997139,
"calib/step_q_w_n": 699.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1702.0,
"completions/max_terminated_length": 1702.0,
"completions/mean_length": 450.12890625,
"completions/mean_terminated_length": 451.8941345214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.0448,
"grad_norm": 0.023945538327097893,
"kl": 0.0476837158203125,
"learning_rate": 4.388888888888889e-06,
"loss": -0.0103,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.035657431930303574,
"mask/share_reasoning": 0.827800989151001,
"mask/share_step_conf": 0.13263539969921112,
"num_tokens": 9997470.0,
"reward": 0.896149754524231,
"reward_std": 0.16470317542552948,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6523573994636536,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8297858238220215,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.7824147939682007,
"adv/mean_abs_reasoning": 0.5487573146820068,
"adv/mean_abs_step_conf": 0.7633628845214844,
"adv/ratio_final_to_reasoning": 1.4257938309607652,
"adv/ratio_step_to_reasoning": 1.3910755521570348,
"adv/std_final_conf": 0.932792067527771,
"adv/std_reasoning": 0.7754303812980652,
"adv/std_step_conf": 0.9347939491271973,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.60501389239707,
"calib/avg_num_step_conf": 5.3515625,
"calib/ece": 0.27352941176470585,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5098039215686274,
"calib/gap": 0.0822448850719879,
"calib/mean_conf": 0.8023137254901961,
"calib/mu_c": 0.8368243243243243,
"calib/mu_w": 0.7545794392523364,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24772549019607842,
"calib/std_conf": 0.23805531489312692,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4695673671199011,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": 0.04070818708425045,
"calib/step_q_w": 0.42885918003565066,
"calib/step_q_w_n": 561.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1876.0,
"completions/max_terminated_length": 1876.0,
"completions/mean_length": 489.28125,
"completions/mean_terminated_length": 489.28125,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.025007378309965134,
"kl": 0.042606353759765625,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0209,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03451922535896301,
"mask/share_reasoning": 0.8459039926528931,
"mask/share_step_conf": 0.11957676708698273,
"num_tokens": 10227950.0,
"reward": 0.9199116230010986,
"reward_std": 0.18631719052791595,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.6879050731658936,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8370743989944458,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.7647424340248108,
"adv/mean_abs_reasoning": 0.4228684604167938,
"adv/mean_abs_step_conf": 0.7544612884521484,
"adv/ratio_final_to_reasoning": 1.8084641102603256,
"adv/ratio_step_to_reasoning": 1.7841512410467435,
"adv/std_final_conf": 0.9265362024307251,
"adv/std_reasoning": 0.7205361127853394,
"adv/std_step_conf": 0.9344058632850647,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6182170542635659,
"calib/avg_num_step_conf": 5.703125,
"calib/ece": 0.3047011952191234,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5219123505976095,
"calib/gap": 0.09971470326598042,
"calib/mean_conf": 0.7993625498007969,
"calib/mu_c": 0.8478294573643411,
"calib/mu_w": 0.7481147540983607,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.29505976095617525,
"calib/std_conf": 0.24303278030476025,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4634710743801653,
"calib/step_q_c_n": 726.0,
"calib/step_q_gap": 0.03848469835836699,
"calib/step_q_w": 0.42498637602179834,
"calib/step_q_w_n": 734.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2194.0,
"completions/max_terminated_length": 2194.0,
"completions/mean_length": 531.890625,
"completions/mean_terminated_length": 531.890625,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.030400488525629044,
"kl": 0.0402679443359375,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0253,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.031088586896657944,
"mask/share_reasoning": 0.8493733406066895,
"mask/share_step_conf": 0.1195380911231041,
"num_tokens": 10470434.0,
"reward": 0.8869525194168091,
"reward_std": 0.18943586945533752,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6466039419174194,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8296449184417725,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.7844476699829102,
"adv/mean_abs_reasoning": 0.5681071281433105,
"adv/mean_abs_step_conf": 0.7434054017066956,
"adv/ratio_final_to_reasoning": 1.3808094127363697,
"adv/ratio_step_to_reasoning": 1.3085655237882605,
"adv/std_final_conf": 0.9305549263954163,
"adv/std_reasoning": 0.7928244471549988,
"adv/std_step_conf": 0.9343764781951904,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6013617677286742,
"calib/avg_num_step_conf": 5.8359375,
"calib/ece": 0.2962948207171314,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.47808764940239046,
"calib/gap": 0.08765994347379247,
"calib/mean_conf": 0.7457768924302789,
"calib/mu_c": 0.7848920863309353,
"calib/mu_w": 0.6972321428571429,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2441434262948207,
"calib/std_conf": 0.29521903210368977,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.461213282247765,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.056431285060704495,
"calib/step_q_w": 0.4047819971870605,
"calib/step_q_w_n": 711.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2018.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 510.1484375,
"completions/mean_terminated_length": 510.1484375,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.048,
"grad_norm": 0.031916260719299316,
"kl": 0.044345855712890625,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0493,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03434551879763603,
"mask/share_reasoning": 0.835544228553772,
"mask/share_step_conf": 0.13011020421981812,
"num_tokens": 10706080.0,
"reward": 0.89674973487854,
"reward_std": 0.20731501281261444,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6590714454650879,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8297404646873474,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.7703957557678223,
"adv/mean_abs_reasoning": 0.4477524757385254,
"adv/mean_abs_step_conf": 0.7582173943519592,
"adv/ratio_final_to_reasoning": 1.7205840224492948,
"adv/ratio_step_to_reasoning": 1.6933851523687307,
"adv/std_final_conf": 0.9339072108268738,
"adv/std_reasoning": 0.7014877200126648,
"adv/std_step_conf": 0.9343469738960266,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5816142921406079,
"calib/avg_num_step_conf": 5.95703125,
"calib/ece": 0.30355999999999994,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4,
"calib/gap": 0.049293104556262346,
"calib/mean_conf": 0.7646000000000002,
"calib/mu_c": 0.7876691729323307,
"calib/mu_w": 0.7383760683760684,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.26808,
"calib/std_conf": 0.2485398157237588,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.41774371727748694,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": -0.0005611447461661978,
"calib/step_q_w": 0.41830486202365313,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2455.0,
"completions/max_terminated_length": 2455.0,
"completions/mean_length": 541.796875,
"completions/mean_terminated_length": 541.796875,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.023222772404551506,
"kl": 0.0390472412109375,
"learning_rate": 4.277777777777778e-06,
"loss": -0.002,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03602005913853645,
"mask/share_reasoning": 0.8326444625854492,
"mask/share_step_conf": 0.13133545219898224,
"num_tokens": 10949548.0,
"reward": 0.8750042915344238,
"reward_std": 0.18136216700077057,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.640655517578125,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8109156489372253,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.7664064764976501,
"adv/mean_abs_reasoning": 0.4121689200401306,
"adv/mean_abs_step_conf": 0.7210854291915894,
"adv/ratio_final_to_reasoning": 1.8594475207471475,
"adv/ratio_step_to_reasoning": 1.7494900613112245,
"adv/std_final_conf": 0.9348024725914001,
"adv/std_reasoning": 0.6817787289619446,
"adv/std_step_conf": 0.9342029094696045,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6607632093933463,
"calib/avg_num_step_conf": 6.2109375,
"calib/ece": 0.18179282868525892,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.3745019920318725,
"calib/gap": 0.1404827136333986,
"calib/mean_conf": 0.7221912350597609,
"calib/mu_c": 0.780958904109589,
"calib/mu_w": 0.6404761904761904,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16115537848605574,
"calib/std_conf": 0.25993176258025147,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.42186956521739133,
"calib/step_q_c_n": 920.0,
"calib/step_q_gap": 0.021048669695003352,
"calib/step_q_w": 0.400820895522388,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2286.0,
"completions/max_terminated_length": 2286.0,
"completions/mean_length": 537.7421875,
"completions/mean_terminated_length": 539.8510131835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.0441867858171463,
"kl": 0.057865142822265625,
"learning_rate": 4.25e-06,
"loss": -0.1043,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.031595584005117416,
"mask/share_reasoning": 0.840315043926239,
"mask/share_step_conf": 0.12418308854103088,
"num_tokens": 11193186.0,
"reward": 0.9337431788444519,
"reward_std": 0.1738939881324768,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7196019887924194,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8385093212127686,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.7661705017089844,
"adv/mean_abs_reasoning": 0.5280641913414001,
"adv/mean_abs_step_conf": 0.7433052062988281,
"adv/ratio_final_to_reasoning": 1.4509041027052818,
"adv/ratio_step_to_reasoning": 1.407603883176907,
"adv/std_final_conf": 0.9345859289169312,
"adv/std_reasoning": 0.7753331065177917,
"adv/std_step_conf": 0.934105396270752,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.609538002980626,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.25224409448818896,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.3346456692913386,
"calib/gap": 0.09073273720814712,
"calib/mean_conf": 0.6824803149606299,
"calib/mu_c": 0.7260606060606061,
"calib/mu_w": 0.635327868852459,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20751968503937007,
"calib/std_conf": 0.284834095941698,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4449210903873745,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.036738030278084866,
"calib/step_q_w": 0.4081830601092896,
"calib/step_q_w_n": 732.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1862.0,
"completions/max_terminated_length": 1862.0,
"completions/mean_length": 485.16796875,
"completions/mean_terminated_length": 487.07061767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.0512,
"grad_norm": 0.028268778696656227,
"kl": 0.05120849609375,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0433,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034794606268405914,
"mask/share_reasoning": 0.8305359482765198,
"mask/share_step_conf": 0.1307632029056549,
"num_tokens": 11421077.0,
"reward": 0.9115187525749207,
"reward_std": 0.16462844610214233,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6826753616333008,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8380183577537537,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7610164880752563,
"adv/mean_abs_reasoning": 0.37937378883361816,
"adv/mean_abs_step_conf": 0.7463734149932861,
"adv/ratio_final_to_reasoning": 2.0059806725577847,
"adv/ratio_step_to_reasoning": 1.9673826631196782,
"adv/std_final_conf": 0.927111029624939,
"adv/std_reasoning": 0.6403971314430237,
"adv/std_step_conf": 0.9335190057754517,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6618327886710238,
"calib/avg_num_step_conf": 5.7890625,
"calib/ece": 0.1867469879518073,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.39759036144578314,
"calib/gap": 0.14107434640522876,
"calib/mean_conf": 0.7339759036144577,
"calib/mu_c": 0.7883660130718955,
"calib/mu_w": 0.6472916666666667,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.153132530120482,
"calib/std_conf": 0.26491768711331565,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4299315849486887,
"calib/step_q_c_n": 877.0,
"calib/step_q_gap": 0.025981171725548258,
"calib/step_q_w": 0.40395041322314046,
"calib/step_q_w_n": 605.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1952.0,
"completions/max_terminated_length": 1952.0,
"completions/mean_length": 500.1640625,
"completions/mean_terminated_length": 500.1640625,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.034990474581718445,
"kl": 0.04461669921875,
"learning_rate": 4.194444444444445e-06,
"loss": 0.0257,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03338824212551117,
"mask/share_reasoning": 0.8339203000068665,
"mask/share_step_conf": 0.13269150257110596,
"num_tokens": 11653655.0,
"reward": 0.9292441010475159,
"reward_std": 0.16790857911109924,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7214101552963257,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8237967491149902,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7604638338088989,
"adv/mean_abs_reasoning": 0.4985535740852356,
"adv/mean_abs_step_conf": 0.7410811185836792,
"adv/ratio_final_to_reasoning": 1.5253402509534224,
"adv/ratio_step_to_reasoning": 1.4864623525033234,
"adv/std_final_conf": 0.9323478937149048,
"adv/std_reasoning": 0.7394198775291443,
"adv/std_step_conf": 0.9342080354690552,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.696236559139785,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.16378486055776886,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2868525896414343,
"calib/gap": 0.17506669388866214,
"calib/mean_conf": 0.6755776892430279,
"calib/mu_c": 0.7404430379746836,
"calib/mu_w": 0.5653763440860214,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10494023904382463,
"calib/std_conf": 0.2817333318214269,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4413221153846154,
"calib/step_q_c_n": 832.0,
"calib/step_q_gap": 0.0021435439560439917,
"calib/step_q_w": 0.4391785714285714,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2354.0,
"completions/max_terminated_length": 2354.0,
"completions/mean_length": 502.4453125,
"completions/mean_terminated_length": 504.41571044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.033615924417972565,
"kl": 0.0509490966796875,
"learning_rate": 4.166666666666667e-06,
"loss": -0.0329,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03367741405963898,
"mask/share_reasoning": 0.8400186896324158,
"mask/share_step_conf": 0.12239763140678406,
"num_tokens": 11887641.0,
"reward": 0.953680157661438,
"reward_std": 0.16476009786128998,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7499589920043945,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8386512994766235,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.7251266837120056,
"adv/mean_abs_reasoning": 0.48975202441215515,
"adv/mean_abs_step_conf": 0.7647483348846436,
"adv/ratio_final_to_reasoning": 1.4805996658867688,
"adv/ratio_step_to_reasoning": 1.5615011204957527,
"adv/std_final_conf": 0.9339321851730347,
"adv/std_reasoning": 0.7574661374092102,
"adv/std_step_conf": 0.9339545965194702,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7184901247401246,
"calib/avg_num_step_conf": 5.75,
"calib/ece": 0.1662698412698413,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.4087301587301587,
"calib/gap": 0.2079625779625781,
"calib/mean_conf": 0.7244444444444444,
"calib/mu_c": 0.8102702702702703,
"calib/mu_w": 0.6023076923076922,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.15170634920634923,
"calib/std_conf": 0.27266339372688003,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4352357320099255,
"calib/step_q_c_n": 806.0,
"calib/step_q_gap": 0.036962458736652215,
"calib/step_q_w": 0.3982732732732733,
"calib/step_q_w_n": 666.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2251.0,
"completions/max_terminated_length": 2251.0,
"completions/mean_length": 527.70703125,
"completions/mean_terminated_length": 527.70703125,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.0544,
"grad_norm": 0.03140642121434212,
"kl": 0.047603607177734375,
"learning_rate": 4.138888888888889e-06,
"loss": 0.0705,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03250539302825928,
"mask/share_reasoning": 0.8480877876281738,
"mask/share_step_conf": 0.11940683424472809,
"num_tokens": 12132030.0,
"reward": 0.9555783867835999,
"reward_std": 0.16661688685417175,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7533218860626221,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8453348875045776,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.7324730157852173,
"adv/mean_abs_reasoning": 0.37932929396629333,
"adv/mean_abs_step_conf": 0.7724111080169678,
"adv/ratio_final_to_reasoning": 1.930968758374627,
"adv/ratio_step_to_reasoning": 2.036254832682664,
"adv/std_final_conf": 0.9282681345939636,
"adv/std_reasoning": 0.6612043976783752,
"adv/std_step_conf": 0.9335633516311646,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7267468944099378,
"calib/avg_num_step_conf": 5.2734375,
"calib/ece": 0.11173228346456689,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.37401574803149606,
"calib/gap": 0.23133385093167713,
"calib/mean_conf": 0.6908661417322836,
"calib/mu_c": 0.7546195652173913,
"calib/mu_w": 0.5232857142857141,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.039094488188976304,
"calib/std_conf": 0.284220230740669,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.45716188524590157,
"calib/step_q_c_n": 976.0,
"calib/step_q_gap": 0.062108409310072665,
"calib/step_q_w": 0.3950534759358289,
"calib/step_q_w_n": 374.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1867.0,
"completions/max_terminated_length": 1867.0,
"completions/mean_length": 483.15625,
"completions/mean_terminated_length": 483.15625,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.07565945386886597,
"kl": 0.04888153076171875,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0458,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03548673540353775,
"mask/share_reasoning": 0.8450421094894409,
"mask/share_step_conf": 0.11947111040353775,
"num_tokens": 12363670.0,
"reward": 1.001354455947876,
"reward_std": 0.14560005068778992,
"rewards/accuracy_reward_step": 0.71875,
"rewards/final_brier_reward_step": 0.8012363314628601,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8600664734840393,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7319698929786682,
"adv/mean_abs_reasoning": 0.42775505781173706,
"adv/mean_abs_step_conf": 0.7573376893997192,
"adv/ratio_final_to_reasoning": 1.7111893351377314,
"adv/ratio_step_to_reasoning": 1.7704938271776967,
"adv/std_final_conf": 0.9073989391326904,
"adv/std_reasoning": 0.7013720273971558,
"adv/std_step_conf": 0.9335355162620544,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6764725195350975,
"calib/avg_num_step_conf": 5.90625,
"calib/ece": 0.22283464566929134,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5118110236220472,
"calib/gap": 0.14950554862433507,
"calib/mean_conf": 0.7622047244094489,
"calib/mu_c": 0.8192993630573248,
"calib/mu_w": 0.6697938144329897,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18346456692913385,
"calib/std_conf": 0.27814320614979304,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4495724713242961,
"calib/step_q_c_n": 959.0,
"calib/step_q_gap": 0.028577896279088133,
"calib/step_q_w": 0.420994575045208,
"calib/step_q_w_n": 553.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1899.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 514.00390625,
"completions/mean_terminated_length": 514.00390625,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.03649696707725525,
"kl": 0.044811248779296875,
"learning_rate": 4.083333333333334e-06,
"loss": -0.0379,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.0319090262055397,
"mask/share_reasoning": 0.8450208306312561,
"mask/share_step_conf": 0.12307015061378479,
"num_tokens": 12601079.0,
"reward": 0.9459646940231323,
"reward_std": 0.1464155912399292,
"rewards/accuracy_reward_step": 0.61328125,
"rewards/final_brier_reward_step": 0.7299094200134277,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8417074680328369,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.6466980576515198,
"adv/mean_abs_reasoning": 0.41618654131889343,
"adv/mean_abs_step_conf": 0.7486047744750977,
"adv/ratio_final_to_reasoning": 1.5538658592902508,
"adv/ratio_step_to_reasoning": 1.7987241300566141,
"adv/std_final_conf": 0.8837612867355347,
"adv/std_reasoning": 0.7204023599624634,
"adv/std_step_conf": 0.9337106347084045,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7693302779420461,
"calib/avg_num_step_conf": 5.703125,
"calib/ece": 0.17437007874015756,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7125984251968503,
"calib/gap": 0.2069544648137196,
"calib/mean_conf": 0.8731889763779528,
"calib/mu_c": 0.9351123595505617,
"calib/mu_w": 0.7281578947368421,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.17338582677165362,
"calib/std_conf": 0.21359794567870916,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4733155080213904,
"calib/step_q_c_n": 935.0,
"calib/step_q_gap": 0.06228693659281892,
"calib/step_q_w": 0.41102857142857147,
"calib/step_q_w_n": 525.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2332.0,
"completions/max_terminated_length": 2332.0,
"completions/mean_length": 472.18359375,
"completions/mean_terminated_length": 472.18359375,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.0576,
"grad_norm": 0.03446084260940552,
"kl": 0.042400360107421875,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0397,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03629143908619881,
"mask/share_reasoning": 0.832728922367096,
"mask/share_step_conf": 0.13097967207431793,
"num_tokens": 12828190.0,
"reward": 0.9863956570625305,
"reward_std": 0.1475645750761032,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.7954957485198975,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8397955894470215,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.710288405418396,
"adv/mean_abs_reasoning": 0.4166402816772461,
"adv/mean_abs_step_conf": 0.7546051740646362,
"adv/ratio_final_to_reasoning": 1.7048001277241525,
"adv/ratio_step_to_reasoning": 1.8111671080550908,
"adv/std_final_conf": 0.8620692491531372,
"adv/std_reasoning": 0.681664228439331,
"adv/std_step_conf": 0.9338399171829224,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.831180984465656,
"calib/avg_num_step_conf": 5.15625,
"calib/ece": 0.3065354330708662,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6692913385826772,
"calib/gap": 0.2554569842161083,
"calib/mean_conf": 0.8459055118110237,
"calib/mu_c": 0.9635766423357663,
"calib/mu_w": 0.708119658119658,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3065354330708662,
"calib/std_conf": 0.23156555280354305,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.49875362318840577,
"calib/step_q_c_n": 690.0,
"calib/step_q_gap": 0.06803933747412,
"calib/step_q_w": 0.43071428571428577,
"calib/step_q_w_n": 630.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1627.0,
"completions/max_terminated_length": 1627.0,
"completions/mean_length": 471.10546875,
"completions/mean_terminated_length": 472.9529724121094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.0477132648229599,
"kl": 0.042133331298828125,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0111,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034928515553474426,
"mask/share_reasoning": 0.8399760723114014,
"mask/share_step_conf": 0.12118920683860779,
"num_tokens": 13056617.0,
"reward": 0.9348831176757812,
"reward_std": 0.1851484775543213,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.717585563659668,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8482744097709656,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.6221961975097656,
"adv/mean_abs_reasoning": 0.44493114948272705,
"adv/mean_abs_step_conf": 0.7350568771362305,
"adv/ratio_final_to_reasoning": 1.3984100646428674,
"adv/ratio_step_to_reasoning": 1.6520688155702314,
"adv/std_final_conf": 0.8303307890892029,
"adv/std_reasoning": 0.7205959558486938,
"adv/std_step_conf": 0.9340452551841736,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.760609149993628,
"calib/avg_num_step_conf": 6.0078125,
"calib/ece": 0.35442231075697206,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.7370517928286853,
"calib/gap": 0.1552262010959603,
"calib/mean_conf": 0.8819123505976096,
"calib/mu_c": 0.9548872180451129,
"calib/mu_w": 0.7996610169491526,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3532270916334661,
"calib/std_conf": 0.2205596112582244,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.49531598513011155,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.05866755831752596,
"calib/step_q_w": 0.4366484268125856,
"calib/step_q_w_n": 731.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2173.0,
"completions/max_terminated_length": 2173.0,
"completions/mean_length": 518.59375,
"completions/mean_terminated_length": 522.6771850585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.027666104957461357,
"kl": 0.041835784912109375,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0158,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03235214948654175,
"mask/share_reasoning": 0.8360726833343506,
"mask/share_step_conf": 0.12376265227794647,
"num_tokens": 13296217.0,
"reward": 0.8819053173065186,
"reward_std": 0.1791996955871582,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6354690790176392,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8299038410186768,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.6285187005996704,
"adv/mean_abs_reasoning": 0.4043678939342499,
"adv/mean_abs_step_conf": 0.7555187940597534,
"adv/ratio_final_to_reasoning": 1.5543239461584637,
"adv/ratio_step_to_reasoning": 1.86839461142432,
"adv/std_final_conf": 0.8306865096092224,
"adv/std_reasoning": 0.6815210580825806,
"adv/std_step_conf": 0.9339905381202698,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7224432009708084,
"calib/avg_num_step_conf": 5.40234375,
"calib/ece": 0.2949602362204724,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.8110236220472441,
"calib/gap": 0.1101253354007955,
"calib/mean_conf": 0.9085830708661418,
"calib/mu_c": 0.9480374233128834,
"calib/mu_w": 0.8379120879120879,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2809055118110235,
"calib/std_conf": 0.20481708814279678,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5285098952270082,
"calib/step_q_c_n": 859.0,
"calib/step_q_gap": 0.0643495898834967,
"calib/step_q_w": 0.4641603053435115,
"calib/step_q_w_n": 524.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2337.0,
"completions/max_terminated_length": 2337.0,
"completions/mean_length": 502.36328125,
"completions/mean_terminated_length": 502.36328125,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.0608,
"grad_norm": 0.03946515545248985,
"kl": 0.042720794677734375,
"learning_rate": 3.972222222222223e-06,
"loss": -0.0043,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03254619985818863,
"mask/share_reasoning": 0.8481278419494629,
"mask/share_step_conf": 0.11932602524757385,
"num_tokens": 13531614.0,
"reward": 0.9359113574028015,
"reward_std": 0.17461207509040833,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7020390629768372,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8432211875915527,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.702364444732666,
"adv/mean_abs_reasoning": 0.6132348775863647,
"adv/mean_abs_step_conf": 0.7652078866958618,
"adv/ratio_final_to_reasoning": 1.14534327776187,
"adv/ratio_step_to_reasoning": 1.2478218618413366,
"adv/std_final_conf": 0.8918449878692627,
"adv/std_reasoning": 0.8266597390174866,
"adv/std_step_conf": 0.9347798824310303,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5812231404958678,
"calib/avg_num_step_conf": 6.26171875,
"calib/ece": 0.3894715447154472,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7520325203252033,
"calib/gap": 0.08071999999999979,
"calib/mean_conf": 0.8710162601626017,
"calib/mu_c": 0.9107199999999999,
"calib/mu_w": 0.8300000000000001,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.3761788617886179,
"calib/std_conf": 0.2487010640262768,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5189918256130791,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": 0.09607352872009867,
"calib/step_q_w": 0.42291829689298044,
"calib/step_q_w_n": 869.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2473.0,
"completions/max_terminated_length": 2473.0,
"completions/mean_length": 612.94921875,
"completions/mean_terminated_length": 612.94921875,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.03811941668391228,
"kl": 0.03516387939453125,
"learning_rate": 3.944444444444445e-06,
"loss": 0.0669,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.02849685214459896,
"mask/share_reasoning": 0.8575760722160339,
"mask/share_step_conf": 0.11392708867788315,
"num_tokens": 13794849.0,
"reward": 0.8243527412414551,
"reward_std": 0.2575843036174774,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5681480169296265,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.793057382106781,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.6952832937240601,
"adv/mean_abs_reasoning": 0.5429081916809082,
"adv/mean_abs_step_conf": 0.7698467969894409,
"adv/ratio_final_to_reasoning": 1.2806645845062319,
"adv/ratio_step_to_reasoning": 1.4180054911418887,
"adv/std_final_conf": 0.8595199584960938,
"adv/std_reasoning": 0.7577628493309021,
"adv/std_step_conf": 0.9347355961799622,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6182361439453639,
"calib/avg_num_step_conf": 5.3984375,
"calib/ece": 0.36128514056224903,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8192771084337349,
"calib/gap": 0.07229905437352246,
"calib/mean_conf": 0.9089959839357429,
"calib/mu_c": 0.940354609929078,
"calib/mu_w": 0.8680555555555556,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3520080321285141,
"calib/std_conf": 0.2067461532203717,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5530294511378849,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.039438899956782625,
"calib/step_q_w": 0.5135905511811023,
"calib/step_q_w_n": 635.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2113.0,
"completions/max_terminated_length": 2113.0,
"completions/mean_length": 560.06640625,
"completions/mean_terminated_length": 560.06640625,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.048179443925619125,
"kl": 0.040973663330078125,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0553,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032195284962654114,
"mask/share_reasoning": 0.8571901321411133,
"mask/share_step_conf": 0.11061456054449081,
"num_tokens": 14044474.0,
"reward": 0.8571747541427612,
"reward_std": 0.24229061603546143,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6124788522720337,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.796401858329773,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.6338130235671997,
"adv/mean_abs_reasoning": 0.5311764478683472,
"adv/mean_abs_step_conf": 0.7529253959655762,
"adv/ratio_final_to_reasoning": 1.1932250123489871,
"adv/ratio_step_to_reasoning": 1.417467583487794,
"adv/std_final_conf": 0.8211374878883362,
"adv/std_reasoning": 0.7754148840904236,
"adv/std_step_conf": 0.9346369504928589,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7307056579783853,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.3291304347826087,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.766798418972332,
"calib/gap": 0.16638461538461535,
"calib/mean_conf": 0.8721343873517787,
"calib/mu_c": 0.9444755244755245,
"calib/mu_w": 0.7780909090909092,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3180237154150198,
"calib/std_conf": 0.25232239424135916,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5669808541973491,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.07766474477485669,
"calib/step_q_w": 0.48931610942249243,
"calib/step_q_w_n": 658.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2400.0,
"completions/max_terminated_length": 2400.0,
"completions/mean_length": 517.28515625,
"completions/mean_terminated_length": 517.28515625,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.064,
"grad_norm": 0.02972288429737091,
"kl": 0.043041229248046875,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0674,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0328441746532917,
"mask/share_reasoning": 0.8521276712417603,
"mask/share_step_conf": 0.11502814292907715,
"num_tokens": 14285755.0,
"reward": 0.9059640169143677,
"reward_std": 0.21950387954711914,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6663120985031128,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.837022066116333,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.5326590538024902,
"adv/mean_abs_reasoning": 0.4154004752635956,
"adv/mean_abs_step_conf": 0.7765494585037231,
"adv/ratio_final_to_reasoning": 1.2822783928316097,
"adv/ratio_step_to_reasoning": 1.8693995427206906,
"adv/std_final_conf": 0.7565453052520752,
"adv/std_reasoning": 0.6815720200538635,
"adv/std_step_conf": 0.9333240389823914,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5917389020225899,
"calib/avg_num_step_conf": 5.38671875,
"calib/ece": 0.3258984375,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.8984375,
"calib/gap": 0.0783412135539796,
"calib/mean_conf": 0.9430859375,
"calib/mu_c": 0.9718518518518519,
"calib/mu_w": 0.8935106382978723,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.3180859375,
"calib/std_conf": 0.17727607033874057,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5699270072992701,
"calib/step_q_c_n": 822.0,
"calib/step_q_gap": 0.04366129814307629,
"calib/step_q_w": 0.5262657091561939,
"calib/step_q_w_n": 557.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1210.0,
"completions/max_terminated_length": 1210.0,
"completions/mean_length": 433.859375,
"completions/mean_terminated_length": 435.5608215332031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.033378370106220245,
"kl": 0.0516357421875,
"learning_rate": 3.861111111111112e-06,
"loss": -0.0017,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.0384836420416832,
"mask/share_reasoning": 0.8277353048324585,
"mask/share_step_conf": 0.1298747956752777,
"num_tokens": 14500887.0,
"reward": 0.9083299040794373,
"reward_std": 0.1809770166873932,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6727949380874634,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8180835247039795,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.6663320064544678,
"adv/mean_abs_reasoning": 0.5563945770263672,
"adv/mean_abs_step_conf": 0.7701988220214844,
"adv/ratio_final_to_reasoning": 1.1975889664770956,
"adv/ratio_step_to_reasoning": 1.3842673056552546,
"adv/std_final_conf": 0.8596050143241882,
"adv/std_reasoning": 0.7928605079650879,
"adv/std_step_conf": 0.9351630806922913,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5797742721559573,
"calib/avg_num_step_conf": 5.41796875,
"calib/ece": 0.34231075697211155,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7768924302788844,
"calib/gap": 0.10010773374374748,
"calib/mean_conf": 0.8688446215139443,
"calib/mu_c": 0.9139130434782609,
"calib/mu_w": 0.8138053097345134,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.33067729083665337,
"calib/std_conf": 0.25382778724686866,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5865571428571429,
"calib/step_q_c_n": 700.0,
"calib/step_q_gap": 0.059089893948845895,
"calib/step_q_w": 0.527467248908297,
"calib/step_q_w_n": 687.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2045.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 501.4609375,
"completions/mean_terminated_length": 503.427490234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.04095854610204697,
"kl": 0.042919158935546875,
"learning_rate": 3.833333333333334e-06,
"loss": -0.0457,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032502319663763046,
"mask/share_reasoning": 0.850849986076355,
"mask/share_step_conf": 0.11274144798517227,
"num_tokens": 14736341.0,
"reward": 0.8477140665054321,
"reward_std": 0.2483111470937729,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6155894994735718,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7782761454582214,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.6767548322677612,
"adv/mean_abs_reasoning": 0.48050642013549805,
"adv/mean_abs_step_conf": 0.7537362575531006,
"adv/ratio_final_to_reasoning": 1.4084199584199584,
"adv/ratio_step_to_reasoning": 1.5686289006098075,
"adv/std_final_conf": 0.8813891410827637,
"adv/std_reasoning": 0.739285409450531,
"adv/std_step_conf": 0.9344167113304138,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7938900330774512,
"calib/avg_num_step_conf": 5.0234375,
"calib/ece": 0.20101562500000003,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.57421875,
"calib/gap": 0.3144954128440365,
"calib/mean_conf": 0.76609375,
"calib/mu_c": 0.8999999999999999,
"calib/mu_w": 0.5855045871559634,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.19644531250000008,
"calib/std_conf": 0.3045802295142242,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5879384203480589,
"calib/step_q_c_n": 747.0,
"calib/step_q_gap": 0.060153262648615535,
"calib/step_q_w": 0.5277851576994433,
"calib/step_q_w_n": 539.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1651.0,
"completions/max_terminated_length": 1651.0,
"completions/mean_length": 538.26953125,
"completions/mean_terminated_length": 540.3804321289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.0672,
"grad_norm": 0.041442278772592545,
"kl": 0.0406951904296875,
"learning_rate": 3.8055555555555556e-06,
"loss": 0.0049,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03217097744345665,
"mask/share_reasoning": 0.856690526008606,
"mask/share_step_conf": 0.1072322428226471,
"num_tokens": 14982778.0,
"reward": 0.9666212201118469,
"reward_std": 0.17020484805107117,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7759265899658203,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8432533740997314,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.6773377060890198,
"adv/mean_abs_reasoning": 0.43898671865463257,
"adv/mean_abs_step_conf": 0.7621839046478271,
"adv/ratio_final_to_reasoning": 1.5429571722918272,
"adv/ratio_step_to_reasoning": 1.7362345425476664,
"adv/std_final_conf": 0.8537333607673645,
"adv/std_reasoning": 0.7014127969741821,
"adv/std_step_conf": 0.934469997882843,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6689581856839122,
"calib/avg_num_step_conf": 5.28515625,
"calib/ece": 0.21031620553359676,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6245059288537549,
"calib/gap": 0.15607228915662652,
"calib/mean_conf": 0.7967984189723321,
"calib/mu_c": 0.848,
"calib/mu_w": 0.6919277108433735,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16758893280632406,
"calib/std_conf": 0.2851302089308167,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5487165178571428,
"calib/step_q_c_n": 896.0,
"calib/step_q_gap": 0.03657209772585179,
"calib/step_q_w": 0.512144420131291,
"calib/step_q_w_n": 457.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2263.0,
"completions/max_terminated_length": 2263.0,
"completions/mean_length": 500.83984375,
"completions/mean_terminated_length": 500.83984375,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.047779619693756104,
"kl": 0.044464111328125,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0616,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0342421680688858,
"mask/share_reasoning": 0.8493179678916931,
"mask/share_step_conf": 0.11643985658884048,
"num_tokens": 15214769.0,
"reward": 0.939239501953125,
"reward_std": 0.19172075390815735,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7383691072463989,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8112034201622009,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.5757031440734863,
"adv/mean_abs_reasoning": 0.29549628496170044,
"adv/mean_abs_step_conf": 0.766379177570343,
"adv/ratio_final_to_reasoning": 1.9482584836831494,
"adv/ratio_step_to_reasoning": 2.593532360887969,
"adv/std_final_conf": 0.8076551556587219,
"adv/std_reasoning": 0.5959193110466003,
"adv/std_step_conf": 0.9337574243545532,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.620545465892779,
"calib/avg_num_step_conf": 5.0625,
"calib/ece": 0.32832031250000004,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.7890625,
"calib/gap": 0.12905510828184497,
"calib/mean_conf": 0.8714453125000001,
"calib/mu_c": 0.9263945578231293,
"calib/mu_w": 0.7973394495412843,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3127734375000001,
"calib/std_conf": 0.253972848837779,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5844356306892068,
"calib/step_q_c_n": 769.0,
"calib/step_q_gap": 0.04492898932298284,
"calib/step_q_w": 0.539506641366224,
"calib/step_q_w_n": 527.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1034.0,
"completions/max_terminated_length": 1034.0,
"completions/mean_length": 406.51171875,
"completions/mean_terminated_length": 408.10589599609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.058045756071805954,
"kl": 0.051605224609375,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0014,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.039061564952135086,
"mask/share_reasoning": 0.8285530209541321,
"mask/share_step_conf": 0.12847915291786194,
"num_tokens": 15423860.0,
"reward": 0.9013949632644653,
"reward_std": 0.1410367488861084,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.6657683849334717,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8221778273582458,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.6945334672927856,
"adv/mean_abs_reasoning": 0.49474918842315674,
"adv/mean_abs_step_conf": 0.7819595336914062,
"adv/ratio_final_to_reasoning": 1.4038092098874841,
"adv/ratio_step_to_reasoning": 1.5805170619554403,
"adv/std_final_conf": 0.8789015412330627,
"adv/std_reasoning": 0.7575258612632751,
"adv/std_step_conf": 0.9340898394584656,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7342580645161291,
"calib/avg_num_step_conf": 5.5625,
"calib/ece": 0.22863453815261048,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5180722891566265,
"calib/gap": 0.2801380645161292,
"calib/mean_conf": 0.7266265060240964,
"calib/mu_c": 0.8672580645161292,
"calib/mu_w": 0.58712,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.22863453815261048,
"calib/std_conf": 0.3171102342449586,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5563850556438791,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.10730329463758981,
"calib/step_q_w": 0.44908176100628927,
"calib/step_q_w_n": 795.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 546.6328125,
"completions/mean_terminated_length": 550.93701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.0704,
"grad_norm": 0.04052841290831566,
"kl": 0.0453948974609375,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.0003,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032529592514038086,
"mask/share_reasoning": 0.8484911322593689,
"mask/share_step_conf": 0.1111668050289154,
"num_tokens": 15670150.0,
"reward": 0.9120385646820068,
"reward_std": 0.20076248049736023,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.7170792818069458,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8155916333198547,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.5726084113121033,
"adv/mean_abs_reasoning": 0.33323174715042114,
"adv/mean_abs_step_conf": 0.7718336582183838,
"adv/ratio_final_to_reasoning": 1.7183489154580078,
"adv/ratio_step_to_reasoning": 2.3162068584959203,
"adv/std_final_conf": 0.7796696424484253,
"adv/std_reasoning": 0.6185460686683655,
"adv/std_step_conf": 0.9334813356399536,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.789329290303949,
"calib/avg_num_step_conf": 5.2421875,
"calib/ece": 0.11793650793650802,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5992063492063492,
"calib/gap": 0.3455620532813516,
"calib/mean_conf": 0.7521428571428572,
"calib/mu_c": 0.8632163742690059,
"calib/mu_w": 0.5176543209876543,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.09575396825396834,
"calib/std_conf": 0.3248298874523601,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.565239179954442,
"calib/step_q_c_n": 878.0,
"calib/step_q_gap": 0.08791159374754537,
"calib/step_q_w": 0.4773275862068966,
"calib/step_q_w_n": 464.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2195.0,
"completions/max_terminated_length": 2195.0,
"completions/mean_length": 532.609375,
"completions/mean_terminated_length": 532.609375,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.037965673953294754,
"kl": 0.04518890380859375,
"learning_rate": 3.694444444444445e-06,
"loss": 0.0047,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032018158584833145,
"mask/share_reasoning": 0.859516978263855,
"mask/share_step_conf": 0.10846483707427979,
"num_tokens": 15911506.0,
"reward": 0.996482789516449,
"reward_std": 0.13825537264347076,
"rewards/accuracy_reward_step": 0.66796875,
"rewards/final_brier_reward_step": 0.8088640570640564,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8536326885223389,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.6434445977210999,
"adv/mean_abs_reasoning": 0.508094310760498,
"adv/mean_abs_step_conf": 0.7290732860565186,
"adv/ratio_final_to_reasoning": 1.2663881175091571,
"adv/ratio_step_to_reasoning": 1.4349172400007133,
"adv/std_final_conf": 0.8660622239112854,
"adv/std_reasoning": 0.7752746343612671,
"adv/std_step_conf": 0.9345232844352722,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7749092761189279,
"calib/avg_num_step_conf": 5.0859375,
"calib/ece": 0.24063492063492076,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.6746031746031746,
"calib/gap": 0.34736614248424247,
"calib/mean_conf": 0.7922222222222223,
"calib/mu_c": 0.9479856115107912,
"calib/mu_w": 0.6006194690265487,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.24063492063492076,
"calib/std_conf": 0.3141248586275211,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.54993893129771,
"calib/step_q_c_n": 655.0,
"calib/step_q_gap": 0.1253021461354225,
"calib/step_q_w": 0.4246367851622875,
"calib/step_q_w_n": 647.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2022.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 489.25,
"completions/mean_terminated_length": 489.25,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.03003855049610138,
"kl": 0.049896240234375,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0552,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03539380431175232,
"mask/share_reasoning": 0.846167802810669,
"mask/share_step_conf": 0.11843834817409515,
"num_tokens": 16140842.0,
"reward": 0.9478596448898315,
"reward_std": 0.21063147485256195,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7486066222190857,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8432064056396484,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7356120347976685,
"adv/mean_abs_reasoning": 0.5031900405883789,
"adv/mean_abs_step_conf": 0.7626789808273315,
"adv/ratio_final_to_reasoning": 1.4618970477585747,
"adv/ratio_step_to_reasoning": 1.5156877507661575,
"adv/std_final_conf": 0.9045613408088684,
"adv/std_reasoning": 0.7393735647201538,
"adv/std_step_conf": 0.9342234134674072,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7375190548780488,
"calib/avg_num_step_conf": 5.1484375,
"calib/ece": 0.19768924302788846,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.4342629482071713,
"calib/gap": 0.2881923272357725,
"calib/mean_conf": 0.6307569721115538,
"calib/mu_c": 0.7777235772357725,
"calib/mu_w": 0.48953125,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16920318725099603,
"calib/std_conf": 0.3507042213854929,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5171782178217821,
"calib/step_q_c_n": 606.0,
"calib/step_q_gap": 0.07135841445099556,
"calib/step_q_w": 0.4458198033707865,
"calib/step_q_w_n": 712.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2212.0,
"completions/max_terminated_length": 2212.0,
"completions/mean_length": 581.4296875,
"completions/mean_terminated_length": 583.7098388671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.0736,
"grad_norm": 0.0420890748500824,
"kl": 0.04071044921875,
"learning_rate": 3.638888888888889e-06,
"loss": 0.0318,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03073749504983425,
"mask/share_reasoning": 0.8656871318817139,
"mask/share_step_conf": 0.09966909885406494,
"num_tokens": 16394184.0,
"reward": 0.9349965453147888,
"reward_std": 0.17003074288368225,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7366687059402466,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8411368131637573,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.6535979509353638,
"adv/mean_abs_reasoning": 0.4485911726951599,
"adv/mean_abs_step_conf": 0.78276127576828,
"adv/ratio_final_to_reasoning": 1.4570013649811968,
"adv/ratio_step_to_reasoning": 1.7449324093147178,
"adv/std_final_conf": 0.8590722680091858,
"adv/std_reasoning": 0.7205913066864014,
"adv/std_step_conf": 0.9336462020874023,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7910840932117528,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.14784552845528456,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.4377446808510638,
"calib/mean_conf": 0.6422357723577237,
"calib/mu_c": 0.8290780141843972,
"calib/mu_w": 0.39133333333333337,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.10845528455284556,
"calib/std_conf": 0.3799853994050151,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5202646596858639,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": 0.11241115650115052,
"calib/step_q_w": 0.40785350318471336,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2263.0,
"completions/max_terminated_length": 2263.0,
"completions/mean_length": 564.765625,
"completions/mean_terminated_length": 564.765625,
"completions/min_length": 108.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.04838375374674797,
"kl": 0.043308258056640625,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0411,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03303219750523567,
"mask/share_reasoning": 0.8492827415466309,
"mask/share_step_conf": 0.11768506467342377,
"num_tokens": 16645756.0,
"reward": 0.9579899311065674,
"reward_std": 0.18355971574783325,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7846719026565552,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8297454118728638,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.656448245048523,
"adv/mean_abs_reasoning": 0.5046650171279907,
"adv/mean_abs_step_conf": 0.7693231105804443,
"adv/ratio_final_to_reasoning": 1.30076035145911,
"adv/ratio_step_to_reasoning": 1.5244232995554203,
"adv/std_final_conf": 0.8575155138969421,
"adv/std_reasoning": 0.7753113508224487,
"adv/std_step_conf": 0.9343364238739014,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6714285714285715,
"calib/avg_num_step_conf": 5.66015625,
"calib/ece": 0.2685826771653543,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.5590551181102362,
"calib/gap": 0.20466165413533832,
"calib/mean_conf": 0.6938582677165355,
"calib/mu_c": 0.7857142857142857,
"calib/mu_w": 0.5810526315789474,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20562992125984247,
"calib/std_conf": 0.3592556768226254,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5017489986648865,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.08003214152202931,
"calib/step_q_w": 0.4217168571428572,
"calib/step_q_w_n": 700.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1793.0,
"completions/max_terminated_length": 1793.0,
"completions/mean_length": 526.7578125,
"completions/mean_terminated_length": 528.8235473632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.05430913344025612,
"kl": 0.04532623291015625,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.0496,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03347496688365936,
"mask/share_reasoning": 0.8458138704299927,
"mask/share_step_conf": 0.11680489778518677,
"num_tokens": 16885014.0,
"reward": 0.913619875907898,
"reward_std": 0.18851624429225922,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.69174724817276,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8292425274848938,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.6513075828552246,
"adv/mean_abs_reasoning": 0.4869433641433716,
"adv/mean_abs_step_conf": 0.7313523292541504,
"adv/ratio_final_to_reasoning": 1.337542783853317,
"adv/ratio_step_to_reasoning": 1.5019248296785848,
"adv/std_final_conf": 0.8733137845993042,
"adv/std_reasoning": 0.7574735283851624,
"adv/std_step_conf": 0.9336986541748047,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7550791474510781,
"calib/avg_num_step_conf": 5.390625,
"calib/ece": 0.2041568627450982,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5568627450980392,
"calib/gap": 0.29419543811541826,
"calib/mean_conf": 0.7094901960784314,
"calib/mu_c": 0.8398591549295775,
"calib/mu_w": 0.5456637168141593,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.17839215686274526,
"calib/std_conf": 0.3480626139607879,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.48932885906040263,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.07074618189504822,
"calib/step_q_w": 0.4185826771653544,
"calib/step_q_w_n": 635.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1604.0,
"completions/max_terminated_length": 1604.0,
"completions/mean_length": 494.8828125,
"completions/mean_terminated_length": 494.8828125,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0768,
"grad_norm": 0.04387129843235016,
"kl": 0.0503692626953125,
"learning_rate": 3.555555555555556e-06,
"loss": -0.0452,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.032157592475414276,
"mask/share_reasoning": 0.8548599481582642,
"mask/share_step_conf": 0.11298239976167679,
"num_tokens": 17116112.0,
"reward": 0.9658874273300171,
"reward_std": 0.16145509481430054,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7510405778884888,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8705779314041138,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.6699906587600708,
"adv/mean_abs_reasoning": 0.5513850450515747,
"adv/mean_abs_step_conf": 0.7873334884643555,
"adv/ratio_final_to_reasoning": 1.2151048795627057,
"adv/ratio_step_to_reasoning": 1.4279195555452742,
"adv/std_final_conf": 0.8398651480674744,
"adv/std_reasoning": 0.7753834128379822,
"adv/std_step_conf": 0.9337665438652039,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7150241447323675,
"calib/avg_num_step_conf": 5.3203125,
"calib/ece": 0.1591015625,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.63671875,
"calib/gap": 0.31899952390668573,
"calib/mean_conf": 0.7627734374999999,
"calib/mu_c": 0.8711834319526628,
"calib/mu_w": 0.5521839080459771,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.130859375,
"calib/std_conf": 0.3359183474736585,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5108766627771295,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": 0.08652022713356511,
"calib/step_q_w": 0.4243564356435644,
"calib/step_q_w_n": 505.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1508.0,
"completions/max_terminated_length": 1508.0,
"completions/mean_length": 474.89453125,
"completions/mean_terminated_length": 476.75689697265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.042493823915719986,
"kl": 0.0510711669921875,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.0119,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.033234454691410065,
"mask/share_reasoning": 0.8480768203735352,
"mask/share_step_conf": 0.11478252708911896,
"num_tokens": 17344717.0,
"reward": 0.9947052001953125,
"reward_std": 0.1547623872756958,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.7954136729240417,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.861965537071228,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.6531205177307129,
"adv/mean_abs_reasoning": 0.48068854212760925,
"adv/mean_abs_step_conf": 0.7459403276443481,
"adv/ratio_final_to_reasoning": 1.3587187138680077,
"adv/ratio_step_to_reasoning": 1.5518163265192246,
"adv/std_final_conf": 0.8750687837600708,
"adv/std_reasoning": 0.7392587065696716,
"adv/std_step_conf": 0.9338178038597107,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.7663891132790654,
"calib/avg_num_step_conf": 5.4296875,
"calib/ece": 0.16885826771653542,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.48031496062992124,
"calib/gap": 0.36025166221338456,
"calib/mean_conf": 0.6433464566929135,
"calib/mu_c": 0.8149624060150376,
"calib/mu_w": 0.454710743801653,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14429133858267715,
"calib/std_conf": 0.3650221410165784,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.47199115044247786,
"calib/step_q_c_n": 678.0,
"calib/step_q_gap": 0.09902766729641044,
"calib/step_q_w": 0.3729634831460674,
"calib/step_q_w_n": 712.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1632.0,
"completions/max_terminated_length": 1632.0,
"completions/mean_length": 498.171875,
"completions/mean_terminated_length": 498.171875,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.04365735128521919,
"kl": 0.05310821533203125,
"learning_rate": 3.5e-06,
"loss": 0.0409,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03375405818223953,
"mask/share_reasoning": 0.8440734148025513,
"mask/share_step_conf": 0.12217249721288681,
"num_tokens": 17576177.0,
"reward": 0.9710075259208679,
"reward_std": 0.15870296955108643,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7765917778015137,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8630794286727905,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.5426826477050781,
"adv/mean_abs_reasoning": 0.3250294327735901,
"adv/mean_abs_step_conf": 0.744985818862915,
"adv/ratio_final_to_reasoning": 1.6696415554559993,
"adv/ratio_step_to_reasoning": 2.2920564839488224,
"adv/std_final_conf": 0.7762205600738525,
"adv/std_reasoning": 0.6184999346733093,
"adv/std_step_conf": 0.9328890442848206,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.8371071428571429,
"calib/avg_num_step_conf": 5.3203125,
"calib/ece": 0.12968627450980397,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7490196078431373,
"calib/gap": 0.4910464285714284,
"calib/mean_conf": 0.8121176470588235,
"calib/mu_c": 0.9661714285714285,
"calib/mu_w": 0.4751250000000001,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.127764705882353,
"calib/std_conf": 0.32543218004098545,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5055732758620689,
"calib/step_q_c_n": 928.0,
"calib/step_q_gap": 0.10974378277451124,
"calib/step_q_w": 0.39582949308755766,
"calib/step_q_w_n": 434.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1771.0,
"completions/max_terminated_length": 1771.0,
"completions/mean_length": 473.49609375,
"completions/mean_terminated_length": 473.49609375,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.08,
"grad_norm": 0.06338401883840561,
"kl": 0.07183837890625,
"learning_rate": 3.4722222222222224e-06,
"loss": 0.0384,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035120923072099686,
"mask/share_reasoning": 0.8420246839523315,
"mask/share_step_conf": 0.12285438925027847,
"num_tokens": 17802144.0,
"reward": 1.0387458801269531,
"reward_std": 0.1380448341369629,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.87017422914505,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8721611499786377,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.6495287418365479,
"adv/mean_abs_reasoning": 0.4122047424316406,
"adv/mean_abs_step_conf": 0.7574204802513123,
"adv/ratio_final_to_reasoning": 1.5757430106332768,
"adv/ratio_step_to_reasoning": 1.8374860895175695,
"adv/std_final_conf": 0.848217785358429,
"adv/std_reasoning": 0.6815527081489563,
"adv/std_step_conf": 0.9339469075202942,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7127142857142856,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.1978039215686274,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.5843137254901961,
"calib/gap": 0.2991428571428571,
"calib/mean_conf": 0.703294117647059,
"calib/mu_c": 0.7971428571428572,
"calib/mu_w": 0.49800000000000005,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1074117647058823,
"calib/std_conf": 0.37153435405074947,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.48410377358490564,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.10021763497104424,
"calib/step_q_w": 0.3838861386138614,
"calib/step_q_w_n": 404.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1969.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 494.2109375,
"completions/mean_terminated_length": 496.1490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.11171876639127731,
"kl": 0.0479583740234375,
"learning_rate": 3.444444444444445e-06,
"loss": 0.0354,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03396555408835411,
"mask/share_reasoning": 0.8541315793991089,
"mask/share_step_conf": 0.10799665749073029,
"num_tokens": 18031718.0,
"reward": 0.987180769443512,
"reward_std": 0.15297412872314453,
"rewards/accuracy_reward_step": 0.68359375,
"rewards/final_brier_reward_step": 0.7706863284111023,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8685189485549927,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.6657007932662964,
"adv/mean_abs_reasoning": 0.45153507590293884,
"adv/mean_abs_step_conf": 0.7501556873321533,
"adv/ratio_final_to_reasoning": 1.4743058264855469,
"adv/ratio_step_to_reasoning": 1.6613453247946688,
"adv/std_final_conf": 0.8381210565567017,
"adv/std_reasoning": 0.7014268636703491,
"adv/std_step_conf": 0.9340924620628357,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6687104430379747,
"calib/avg_num_step_conf": 5.26953125,
"calib/ece": 0.2486614173228348,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.6653543307086615,
"calib/gap": 0.1927254746835444,
"calib/mean_conf": 0.7751968503937008,
"calib/mu_c": 0.8480379746835444,
"calib/mu_w": 0.6553125,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.20090551181102378,
"calib/std_conf": 0.3336150131483204,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4864655172413793,
"calib/step_q_c_n": 812.0,
"calib/step_q_gap": 0.07531095485776662,
"calib/step_q_w": 0.4111545623836127,
"calib/step_q_w_n": 537.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2197.0,
"completions/max_terminated_length": 2197.0,
"completions/mean_length": 507.390625,
"completions/mean_terminated_length": 507.390625,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.04007513448596001,
"kl": 0.04627227783203125,
"learning_rate": 3.416666666666667e-06,
"loss": 0.0272,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03483344614505768,
"mask/share_reasoning": 0.8469037413597107,
"mask/share_step_conf": 0.11826279759407043,
"num_tokens": 18266274.0,
"reward": 0.9439308047294617,
"reward_std": 0.17922864854335785,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7151319980621338,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8508545160293579,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.6560389995574951,
"adv/mean_abs_reasoning": 0.45338594913482666,
"adv/mean_abs_step_conf": 0.7529264688491821,
"adv/ratio_final_to_reasoning": 1.4469769096492315,
"adv/ratio_step_to_reasoning": 1.660674465730474,
"adv/std_final_conf": 0.8485627174377441,
"adv/std_reasoning": 0.7205855250358582,
"adv/std_step_conf": 0.9336483478546143,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6738016584642352,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.23909448818897638,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.7362204724409449,
"calib/gap": 0.1971381379356838,
"calib/mean_conf": 0.8309055118110237,
"calib/mu_c": 0.9015337423312882,
"calib/mu_w": 0.7043956043956044,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21413385826771655,
"calib/std_conf": 0.29396215228749084,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5272615039281705,
"calib/step_q_c_n": 891.0,
"calib/step_q_gap": 0.06413516559840599,
"calib/step_q_w": 0.4631263383297645,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2029.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 541.21875,
"completions/mean_terminated_length": 541.21875,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.0832,
"grad_norm": 0.045858126133680344,
"kl": 0.04248809814453125,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.0349,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.02998296171426773,
"mask/share_reasoning": 0.8644160032272339,
"mask/share_step_conf": 0.10560107231140137,
"num_tokens": 18512850.0,
"reward": 0.9525356292724609,
"reward_std": 0.19739927351474762,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7249546647071838,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8558976650238037,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.5855348110198975,
"adv/mean_abs_reasoning": 0.39158886671066284,
"adv/mean_abs_step_conf": 0.7473605871200562,
"adv/ratio_final_to_reasoning": 1.4952795158309171,
"adv/ratio_step_to_reasoning": 1.9085337982100137,
"adv/std_final_conf": 0.796627402305603,
"adv/std_reasoning": 0.6815370917320251,
"adv/std_step_conf": 0.933944821357727,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6948180379746836,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.26480314960629925,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.7598425196850394,
"calib/gap": 0.2132502637130802,
"calib/mean_conf": 0.842755905511811,
"calib/mu_c": 0.9233544303797467,
"calib/mu_w": 0.7101041666666665,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.24275590551181106,
"calib/std_conf": 0.2932969717645023,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5471011025358324,
"calib/step_q_c_n": 907.0,
"calib/step_q_gap": 0.05951825653193382,
"calib/step_q_w": 0.4875828460038986,
"calib/step_q_w_n": 513.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2142.0,
"completions/max_terminated_length": 2142.0,
"completions/mean_length": 539.546875,
"completions/mean_terminated_length": 539.546875,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.7947566509246826,
"kl": 0.7817840576171875,
"learning_rate": 3.3611111111111117e-06,
"loss": 0.0028,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.031161334365606308,
"mask/share_reasoning": 0.8599764108657837,
"mask/share_step_conf": 0.10886222869157791,
"num_tokens": 18757350.0,
"reward": 0.9420266151428223,
"reward_std": 0.17382916808128357,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7247257828712463,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8374522924423218,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.6251348853111267,
"adv/mean_abs_reasoning": 0.5489984750747681,
"adv/mean_abs_step_conf": 0.7228517532348633,
"adv/ratio_final_to_reasoning": 1.1386823710684981,
"adv/ratio_step_to_reasoning": 1.316673517419913,
"adv/std_final_conf": 0.8623039126396179,
"adv/std_reasoning": 0.8097235560417175,
"adv/std_step_conf": 0.9342271685600281,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6589888405008165,
"calib/avg_num_step_conf": 5.64453125,
"calib/ece": 0.29584313725490197,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 0.9098039215686274,
"calib/gap": 0.12011771910724,
"calib/mean_conf": 0.9419607843137255,
"calib/mu_c": 0.9834131736526945,
"calib/mu_w": 0.8632954545454545,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2914509803921569,
"calib/std_conf": 0.18354269266510556,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.585968253968254,
"calib/step_q_c_n": 945.0,
"calib/step_q_gap": 0.0781834539682541,
"calib/step_q_w": 0.5077847999999999,
"calib/step_q_w_n": 500.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1930.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 490.2109375,
"completions/mean_terminated_length": 492.13336181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.047408103942871094,
"kl": 0.050380706787109375,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0121,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.032809220254421234,
"mask/share_reasoning": 0.8404864072799683,
"mask/share_step_conf": 0.12279807031154633,
"num_tokens": 18985004.0,
"reward": 0.9354228973388672,
"reward_std": 0.2329998016357422,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.705510139465332,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8364294767379761,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.5626240968704224,
"adv/mean_abs_reasoning": 0.41431060433387756,
"adv/mean_abs_step_conf": 0.7470904588699341,
"adv/ratio_final_to_reasoning": 1.357976578405472,
"adv/ratio_step_to_reasoning": 1.803213461241464,
"adv/std_final_conf": 0.7812780141830444,
"adv/std_reasoning": 0.7013968825340271,
"adv/std_step_conf": 0.9344127774238586,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6627988748241913,
"calib/avg_num_step_conf": 5.5390625,
"calib/ece": 0.2665274193548387,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8225806451612904,
"calib/gap": 0.2256700421940927,
"calib/mean_conf": 0.8794403225806452,
"calib/mu_c": 0.9613367088607595,
"calib/mu_w": 0.7356666666666668,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25443548387096776,
"calib/std_conf": 0.26357985076429535,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6292937853107344,
"calib/step_q_c_n": 708.0,
"calib/step_q_gap": 0.18060364446566401,
"calib/step_q_w": 0.4486901408450704,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2946.0,
"completions/max_terminated_length": 2946.0,
"completions/mean_length": 529.61328125,
"completions/mean_terminated_length": 533.783447265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.0864,
"grad_norm": 0.030594119802117348,
"kl": 0.040287017822265625,
"learning_rate": 3.3055555555555558e-06,
"loss": 0.0861,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03291993588209152,
"mask/share_reasoning": 0.8520771265029907,
"mask/share_step_conf": 0.10719040036201477,
"num_tokens": 19226833.0,
"reward": 0.919786274433136,
"reward_std": 0.21998167037963867,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.7177573442459106,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8054088354110718,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.6789511442184448,
"adv/mean_abs_reasoning": 0.5512694120407104,
"adv/mean_abs_step_conf": 0.7774230241775513,
"adv/ratio_final_to_reasoning": 1.2316140336991983,
"adv/ratio_step_to_reasoning": 1.4102415392496686,
"adv/std_final_conf": 0.8529064059257507,
"adv/std_reasoning": 0.792765200138092,
"adv/std_step_conf": 0.9345331788063049,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6438387096774194,
"calib/avg_num_step_conf": 4.5390625,
"calib/ece": 0.31152941176470605,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.8235294117647058,
"calib/gap": 0.1329129032258064,
"calib/mean_conf": 0.9054901960784314,
"calib/mu_c": 0.9576129032258065,
"calib/mu_w": 0.8247000000000001,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3045882352941178,
"calib/std_conf": 0.23330795709760863,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6615191740412979,
"calib/step_q_c_n": 678.0,
"calib/step_q_gap": 0.13408115751237237,
"calib/step_q_w": 0.5274380165289255,
"calib/step_q_w_n": 484.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1776.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 452.515625,
"completions/mean_terminated_length": 452.515625,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.04082036763429642,
"kl": 0.054443359375,
"learning_rate": 3.277777777777778e-06,
"loss": 0.054,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.036828212440013885,
"mask/share_reasoning": 0.8555018901824951,
"mask/share_step_conf": 0.1076698899269104,
"num_tokens": 19448229.0,
"reward": 0.9207143783569336,
"reward_std": 0.23406952619552612,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.6793046593666077,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8418115377426147,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.5660687685012817,
"adv/mean_abs_reasoning": 0.35447534918785095,
"adv/mean_abs_step_conf": 0.7589709162712097,
"adv/ratio_final_to_reasoning": 1.5969199827243807,
"adv/ratio_step_to_reasoning": 2.141110568083537,
"adv/std_final_conf": 0.7831708192825317,
"adv/std_reasoning": 0.6612017154693604,
"adv/std_step_conf": 0.934005618095398,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7029090430433066,
"calib/avg_num_step_conf": 4.85546875,
"calib/ece": 0.3333734939759037,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8152610441767069,
"calib/gap": 0.17196261682242975,
"calib/mean_conf": 0.8761044176706827,
"calib/mu_c": 0.9499999999999998,
"calib/mu_w": 0.7780373831775701,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3195983935742973,
"calib/std_conf": 0.2749218310599134,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.631921921921922,
"calib/step_q_c_n": 666.0,
"calib/step_q_gap": 0.12538448691325654,
"calib/step_q_w": 0.5065374350086654,
"calib/step_q_w_n": 577.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2423.0,
"completions/max_terminated_length": 2423.0,
"completions/mean_length": 566.58203125,
"completions/mean_terminated_length": 566.58203125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.049747027456760406,
"kl": 0.040142059326171875,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0908,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.031713858246803284,
"mask/share_reasoning": 0.8682199716567993,
"mask/share_step_conf": 0.10006619244813919,
"num_tokens": 19700538.0,
"reward": 0.8887869119644165,
"reward_std": 0.1813461184501648,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6517887115478516,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8203163743019104,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.645351767539978,
"adv/mean_abs_reasoning": 0.5118788480758667,
"adv/mean_abs_step_conf": 0.7618239521980286,
"adv/ratio_final_to_reasoning": 1.2607509959941323,
"adv/ratio_step_to_reasoning": 1.48828957293644,
"adv/std_final_conf": 0.8241501450538635,
"adv/std_reasoning": 0.7575427293777466,
"adv/std_step_conf": 0.9342594146728516,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7364296636085628,
"calib/avg_num_step_conf": 4.69140625,
"calib/ece": 0.30798418972332026,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7628458498023716,
"calib/gap": 0.20664946483180446,
"calib/mean_conf": 0.8382608695652174,
"calib/mu_c": 0.9272916666666668,
"calib/mu_w": 0.7206422018348624,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.28853754940711474,
"calib/std_conf": 0.3089945110133803,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.6382036775106082,
"calib/step_q_c_n": 707.0,
"calib/step_q_gap": 0.1228996289276123,
"calib/step_q_w": 0.5153040485829959,
"calib/step_q_w_n": 494.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2183.0,
"completions/max_terminated_length": 2183.0,
"completions/mean_length": 471.50390625,
"completions/mean_terminated_length": 473.35296630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.0896,
"grad_norm": 0.04613294452428818,
"kl": 0.047542572021484375,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0052,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03553298860788345,
"mask/share_reasoning": 0.850753128528595,
"mask/share_step_conf": 0.10980760306119919,
"num_tokens": 19927163.0,
"reward": 0.9026806950569153,
"reward_std": 0.21408578753471375,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6801788806915283,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8150261044502258,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.6471362709999084,
"adv/mean_abs_reasoning": 0.4608671963214874,
"adv/mean_abs_step_conf": 0.7554687261581421,
"adv/ratio_final_to_reasoning": 1.4041708244048794,
"adv/ratio_step_to_reasoning": 1.639233020245488,
"adv/std_final_conf": 0.8407363891601562,
"adv/std_reasoning": 0.7206194400787354,
"adv/std_step_conf": 0.9350539445877075,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7098214285714285,
"calib/avg_num_step_conf": 4.59765625,
"calib/ece": 0.27793522267206483,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.7246963562753036,
"calib/gap": 0.2943055555555555,
"calib/mean_conf": 0.8121052631578948,
"calib/mu_c": 0.9455555555555555,
"calib/mu_w": 0.65125,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.27174089068825913,
"calib/std_conf": 0.32700026162971557,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6017341977309562,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.08634134058809906,
"calib/step_q_w": 0.5153928571428571,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1948.0,
"completions/max_terminated_length": 1948.0,
"completions/mean_length": 508.65625,
"completions/mean_terminated_length": 512.6614379882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.045355018228292465,
"kl": 0.045810699462890625,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.0691,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.033476561307907104,
"mask/share_reasoning": 0.8550655841827393,
"mask/share_step_conf": 0.10364531725645065,
"num_tokens": 20165203.0,
"reward": 0.8950651288032532,
"reward_std": 0.22769135236740112,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6913609504699707,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8011130094528198,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.6400004625320435,
"adv/mean_abs_reasoning": 0.39856961369514465,
"adv/mean_abs_step_conf": 0.7425464391708374,
"adv/ratio_final_to_reasoning": 1.6057432391761879,
"adv/ratio_step_to_reasoning": 1.8630282230666775,
"adv/std_final_conf": 0.8474387526512146,
"adv/std_reasoning": 0.7012442946434021,
"adv/std_step_conf": 0.9346654415130615,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7519546027742748,
"calib/avg_num_step_conf": 4.81640625,
"calib/ece": 0.2580158730158731,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.6388888888888888,
"calib/gap": 0.32935687263556124,
"calib/mean_conf": 0.7628571428571429,
"calib/mu_c": 0.9223076923076924,
"calib/mu_w": 0.5929508196721311,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.25250000000000006,
"calib/std_conf": 0.341313655363902,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5728436018957346,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.13711026856240127,
"calib/step_q_w": 0.4357333333333333,
"calib/step_q_w_n": 600.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2174.0,
"completions/max_terminated_length": 2174.0,
"completions/mean_length": 499.2265625,
"completions/mean_terminated_length": 501.1843566894531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.06268741190433502,
"kl": 0.04718017578125,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.0189,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03539302200078964,
"mask/share_reasoning": 0.8536935448646545,
"mask/share_step_conf": 0.10700717568397522,
"num_tokens": 20398517.0,
"reward": 0.9265288710594177,
"reward_std": 0.18060728907585144,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7257484197616577,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8288718461990356,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.6161288022994995,
"adv/mean_abs_reasoning": 0.424224853515625,
"adv/mean_abs_step_conf": 0.7731361389160156,
"adv/ratio_final_to_reasoning": 1.4523637575534134,
"adv/ratio_step_to_reasoning": 1.8224678080713619,
"adv/std_final_conf": 0.8241699934005737,
"adv/std_reasoning": 0.7013879418373108,
"adv/std_step_conf": 0.9341001510620117,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6289682539682541,
"calib/avg_num_step_conf": 4.453125,
"calib/ece": 0.201771653543307,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7322834645669292,
"calib/gap": 0.19870115995115967,
"calib/mean_conf": 0.8480708661417322,
"calib/mu_c": 0.9043956043956042,
"calib/mu_w": 0.7056944444444445,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.16665354330708657,
"calib/std_conf": 0.28637535747981313,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5853639846743296,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.1364844328536013,
"calib/step_q_w": 0.4488795518207283,
"calib/step_q_w_n": 357.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2980.0,
"completions/max_terminated_length": 2980.0,
"completions/mean_length": 442.0234375,
"completions/mean_terminated_length": 442.0234375,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.0928,
"grad_norm": 0.03550032898783684,
"kl": 0.049495697021484375,
"learning_rate": 3.138888888888889e-06,
"loss": 0.1331,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03907443583011627,
"mask/share_reasoning": 0.8512502312660217,
"mask/share_step_conf": 0.1096753180027008,
"num_tokens": 20617171.0,
"reward": 0.9840619564056396,
"reward_std": 0.17071446776390076,
"rewards/accuracy_reward_step": 0.7109375,
"rewards/final_brier_reward_step": 0.7719812393188477,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8562988638877869,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.6632359623908997,
"adv/mean_abs_reasoning": 0.46517929434776306,
"adv/mean_abs_step_conf": 0.7715968489646912,
"adv/ratio_final_to_reasoning": 1.4257641525529112,
"adv/ratio_step_to_reasoning": 1.6587084987231904,
"adv/std_final_conf": 0.8690577149391174,
"adv/std_reasoning": 0.7205847501754761,
"adv/std_step_conf": 0.9343187212944031,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7721543659043659,
"calib/avg_num_step_conf": 4.65625,
"calib/ece": 0.1628571428571429,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5793650793650794,
"calib/gap": 0.38315228690228686,
"calib/mean_conf": 0.718968253968254,
"calib/mu_c": 0.8770945945945946,
"calib/mu_w": 0.49394230769230774,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.14726190476190482,
"calib/std_conf": 0.3641522648674075,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5372624113475177,
"calib/step_q_c_n": 705.0,
"calib/step_q_gap": 0.14943900272328775,
"calib/step_q_w": 0.38782340862423,
"calib/step_q_w_n": 487.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1888.0,
"completions/max_terminated_length": 1888.0,
"completions/mean_length": 481.125,
"completions/mean_terminated_length": 483.01177978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.03591761738061905,
"kl": 0.05088043212890625,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0273,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.033284686505794525,
"mask/share_reasoning": 0.8623548746109009,
"mask/share_step_conf": 0.10045421123504639,
"num_tokens": 20850187.0,
"reward": 0.9585222005844116,
"reward_std": 0.19591569900512695,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7771499752998352,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8273943662643433,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.697031557559967,
"adv/mean_abs_reasoning": 0.4544700086116791,
"adv/mean_abs_step_conf": 0.7597097754478455,
"adv/ratio_final_to_reasoning": 1.5337239957577578,
"adv/ratio_step_to_reasoning": 1.6716389663833193,
"adv/std_final_conf": 0.8651239275932312,
"adv/std_reasoning": 0.7013769745826721,
"adv/std_step_conf": 0.9346089959144592,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.8085505403687223,
"calib/avg_num_step_conf": 4.703125,
"calib/ece": 0.13569721115537847,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.398406374501992,
"calib/gap": 0.41586967577876677,
"calib/mean_conf": 0.5728286852589641,
"calib/mu_c": 0.7733076923076924,
"calib/mu_w": 0.3574380165289256,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.0952988047808765,
"calib/std_conf": 0.3946156112109845,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5142784380305603,
"calib/step_q_c_n": 589.0,
"calib/step_q_gap": 0.15806705591673914,
"calib/step_q_w": 0.35621138211382114,
"calib/step_q_w_n": 615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2044.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 526.33203125,
"completions/mean_terminated_length": 526.33203125,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.03845527768135071,
"kl": 0.0560150146484375,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.0427,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03411491960287094,
"mask/share_reasoning": 0.8635823726654053,
"mask/share_step_conf": 0.1023026555776596,
"num_tokens": 21093816.0,
"reward": 0.9638468027114868,
"reward_std": 0.18518471717834473,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7836429476737976,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.846394419670105,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.7023544311523438,
"adv/mean_abs_reasoning": 0.39130640029907227,
"adv/mean_abs_step_conf": 0.7519776821136475,
"adv/ratio_final_to_reasoning": 1.7948963538943907,
"adv/ratio_step_to_reasoning": 1.9217106634057535,
"adv/std_final_conf": 0.8781015872955322,
"adv/std_reasoning": 0.6613117456436157,
"adv/std_step_conf": 0.9345213174819946,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7159850034083163,
"calib/avg_num_step_conf": 5.1484375,
"calib/ece": 0.21081686429512514,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5019762845849802,
"calib/gap": 0.31590456714383097,
"calib/mean_conf": 0.6464163372859025,
"calib/mu_c": 0.7587934560327199,
"calib/mu_w": 0.4428888888888889,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.10648221343873517,
"calib/std_conf": 0.39126191196681814,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4267635933806147,
"calib/step_q_c_n": 846.0,
"calib/step_q_gap": 0.05034410185519095,
"calib/step_q_w": 0.37641949152542376,
"calib/step_q_w_n": 472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2211.0,
"completions/max_terminated_length": 2211.0,
"completions/mean_length": 491.67578125,
"completions/mean_terminated_length": 491.67578125,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.096,
"grad_norm": 0.04405174404382706,
"kl": 0.0619354248046875,
"learning_rate": 3.055555555555556e-06,
"loss": 0.016,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03433162346482277,
"mask/share_reasoning": 0.8532933592796326,
"mask/share_step_conf": 0.11237501353025436,
"num_tokens": 21323005.0,
"reward": 0.9484930634498596,
"reward_std": 0.16438844799995422,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7458431720733643,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8277053833007812,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.7540473937988281,
"adv/mean_abs_reasoning": 0.4702809453010559,
"adv/mean_abs_step_conf": 0.7511721849441528,
"adv/ratio_final_to_reasoning": 1.6033977164779998,
"adv/ratio_step_to_reasoning": 1.5972839053967647,
"adv/std_final_conf": 0.926612377166748,
"adv/std_reasoning": 0.7392401695251465,
"adv/std_step_conf": 0.9344815015792847,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7784903280067283,
"calib/avg_num_step_conf": 4.0859375,
"calib/ece": 0.15047808764940235,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.36254980079681276,
"calib/gap": 0.38301233529576684,
"calib/mean_conf": 0.5956573705179283,
"calib/mu_c": 0.7284146341463416,
"calib/mu_w": 0.3454022988505747,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.04637450199203186,
"calib/std_conf": 0.371918502692295,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4663047001620746,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.1303373341947086,
"calib/step_q_w": 0.335967365967366,
"calib/step_q_w_n": 429.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2184.0,
"completions/max_terminated_length": 2184.0,
"completions/mean_length": 485.08203125,
"completions/mean_terminated_length": 485.08203125,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.09253852069377899,
"kl": 0.06554412841796875,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.0087,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03286009281873703,
"mask/share_reasoning": 0.8777602910995483,
"mask/share_step_conf": 0.08937962353229523,
"num_tokens": 21554898.0,
"reward": 0.9783428907394409,
"reward_std": 0.17696067690849304,
"rewards/accuracy_reward_step": 0.640625,
"rewards/final_brier_reward_step": 0.7896254062652588,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8428416848182678,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.6797538995742798,
"adv/mean_abs_reasoning": 0.4213108718395233,
"adv/mean_abs_step_conf": 0.7387102246284485,
"adv/ratio_final_to_reasoning": 1.6134259640781285,
"adv/ratio_step_to_reasoning": 1.7533614107870021,
"adv/std_final_conf": 0.8790198564529419,
"adv/std_reasoning": 0.7013704180717468,
"adv/std_step_conf": 0.9345918297767639,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7346223201763173,
"calib/avg_num_step_conf": 4.390625,
"calib/ece": 0.21255905511811024,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.48031496062992124,
"calib/gap": 0.3103419488412476,
"calib/mean_conf": 0.6553149606299212,
"calib/mu_c": 0.7689440993788821,
"calib/mu_w": 0.4586021505376345,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11700787401574807,
"calib/std_conf": 0.39278595482986994,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.45211656441717785,
"calib/step_q_c_n": 652.0,
"calib/step_q_gap": 0.11465893729853377,
"calib/step_q_w": 0.3374576271186441,
"calib/step_q_w_n": 472.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1998.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 456.37109375,
"completions/mean_terminated_length": 456.37109375,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.040576785802841187,
"kl": 0.06185150146484375,
"learning_rate": 3e-06,
"loss": 0.0198,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03687359765172005,
"mask/share_reasoning": 0.8611937761306763,
"mask/share_step_conf": 0.1019326001405716,
"num_tokens": 21778449.0,
"reward": 0.9587187767028809,
"reward_std": 0.1689954549074173,
"rewards/accuracy_reward_step": 0.62890625,
"rewards/final_brier_reward_step": 0.7444359064102173,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8503453135490417,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.7539007067680359,
"adv/mean_abs_reasoning": 0.5408545732498169,
"adv/mean_abs_step_conf": 0.7772372961044312,
"adv/ratio_final_to_reasoning": 1.3939065028850455,
"adv/ratio_step_to_reasoning": 1.4370541260920997,
"adv/std_final_conf": 0.906922459602356,
"adv/std_reasoning": 0.8098655939102173,
"adv/std_step_conf": 0.9350221753120422,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6648159164518476,
"calib/avg_num_step_conf": 4.890625,
"calib/ece": 0.2301626016260163,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.35772357723577236,
"calib/gap": 0.23170731707317072,
"calib/mean_conf": 0.5529268292682926,
"calib/mu_c": 0.6687804878048781,
"calib/mu_w": 0.43707317073170737,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1415447154471545,
"calib/std_conf": 0.38613728488210686,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.4331559633027523,
"calib/step_q_c_n": 545.0,
"calib/step_q_gap": 0.09087873557998,
"calib/step_q_w": 0.3422772277227723,
"calib/step_q_w_n": 707.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2665.0,
"completions/max_terminated_length": 2665.0,
"completions/mean_length": 507.9765625,
"completions/mean_terminated_length": 507.9765625,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.0992,
"grad_norm": 0.035629406571388245,
"kl": 0.06622314453125,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.0348,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03448627144098282,
"mask/share_reasoning": 0.8541302680969238,
"mask/share_step_conf": 0.11138348281383514,
"num_tokens": 22014267.0,
"reward": 0.8872026205062866,
"reward_std": 0.2100294828414917,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6782886981964111,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8093979358673096,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.7159183025360107,
"adv/mean_abs_reasoning": 0.5076972842216492,
"adv/mean_abs_step_conf": 0.7585917115211487,
"adv/ratio_final_to_reasoning": 1.4101282886190445,
"adv/ratio_step_to_reasoning": 1.4941811490761583,
"adv/std_final_conf": 0.9018604755401611,
"adv/std_reasoning": 0.7752719521522522,
"adv/std_step_conf": 0.9340033531188965,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7935536119209587,
"calib/avg_num_step_conf": 4.51953125,
"calib/ece": 0.18234126984126986,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.3253968253968254,
"calib/gap": 0.39174149659863944,
"calib/mean_conf": 0.4933730158730159,
"calib/mu_c": 0.6565986394557823,
"calib/mu_w": 0.2648571428571429,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.0461904761904762,
"calib/std_conf": 0.39308321817996,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4559424012158054,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.16514079800939258,
"calib/step_q_w": 0.29080160320641285,
"calib/step_q_w_n": 499.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2215.0,
"completions/max_terminated_length": 2215.0,
"completions/mean_length": 464.46484375,
"completions/mean_terminated_length": 466.28631591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.04146898537874222,
"kl": 0.07958221435546875,
"learning_rate": 2.944444444444445e-06,
"loss": -0.0142,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03604136407375336,
"mask/share_reasoning": 0.8550038933753967,
"mask/share_step_conf": 0.10504850745201111,
"num_tokens": 22241850.0,
"reward": 0.96682208776474,
"reward_std": 0.14535515010356903,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7686004042625427,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8541063070297241,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.6310229301452637,
"adv/mean_abs_reasoning": 0.401436448097229,
"adv/mean_abs_step_conf": 0.7615891695022583,
"adv/ratio_final_to_reasoning": 1.5719123989270356,
"adv/ratio_step_to_reasoning": 1.8971599940964985,
"adv/std_final_conf": 0.8184422254562378,
"adv/std_reasoning": 0.6817244291305542,
"adv/std_step_conf": 0.9347649812698364,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7642728256946951,
"calib/avg_num_step_conf": 4.95703125,
"calib/ece": 0.20983870967741944,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.49193548387096775,
"calib/gap": 0.349446409238542,
"calib/mean_conf": 0.6425,
"calib/mu_c": 0.7622699386503068,
"calib/mu_w": 0.4128235294117648,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.09754032258064524,
"calib/std_conf": 0.3960309636803016,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.455609756097561,
"calib/step_q_c_n": 820.0,
"calib/step_q_gap": 0.12491933293497748,
"calib/step_q_w": 0.3306904231625835,
"calib/step_q_w_n": 449.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2478.0,
"completions/max_terminated_length": 2478.0,
"completions/mean_length": 499.47265625,
"completions/mean_terminated_length": 503.405517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.03742410987615585,
"kl": 0.05938720703125,
"learning_rate": 2.916666666666667e-06,
"loss": 0.011,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03435906395316124,
"mask/share_reasoning": 0.8466284871101379,
"mask/share_step_conf": 0.11119996011257172,
"num_tokens": 22475843.0,
"reward": 0.952112078666687,
"reward_std": 0.18023662269115448,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.750076949596405,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8346158862113953,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.6180611848831177,
"adv/mean_abs_reasoning": 0.3653485178947449,
"adv/mean_abs_step_conf": 0.7559719681739807,
"adv/ratio_final_to_reasoning": 1.6917030030519464,
"adv/ratio_step_to_reasoning": 2.0691803336992667,
"adv/std_final_conf": 0.8387512564659119,
"adv/std_reasoning": 0.6612295508384705,
"adv/std_step_conf": 0.9340872168540955,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.8002291543465405,
"calib/avg_num_step_conf": 4.27734375,
"calib/ece": 0.1503543307086615,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5669291338582677,
"calib/gap": 0.41226197516262564,
"calib/mean_conf": 0.6899606299212598,
"calib/mu_c": 0.813314606741573,
"calib/mu_w": 0.4010526315789473,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.06976377952755913,
"calib/std_conf": 0.3873303520247324,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.4998544793087767,
"calib/step_q_c_n": 733.0,
"calib/step_q_gap": 0.141871053894412,
"calib/step_q_w": 0.3579834254143647,
"calib/step_q_w_n": 362.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1193.0,
"completions/max_terminated_length": 1193.0,
"completions/mean_length": 422.55859375,
"completions/mean_terminated_length": 422.55859375,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.1024,
"grad_norm": 0.06945142149925232,
"kl": 0.07413482666015625,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0411,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03754666820168495,
"mask/share_reasoning": 0.858115553855896,
"mask/share_step_conf": 0.10433775186538696,
"num_tokens": 22689834.0,
"reward": 0.9810190200805664,
"reward_std": 0.16697180271148682,
"rewards/accuracy_reward_step": 0.6953125,
"rewards/final_brier_reward_step": 0.8019851446151733,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8248965740203857,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.6564252972602844,
"adv/mean_abs_reasoning": 0.4691106677055359,
"adv/mean_abs_step_conf": 0.7472108602523804,
"adv/ratio_final_to_reasoning": 1.3992973139385678,
"adv/ratio_step_to_reasoning": 1.5928242772799401,
"adv/std_final_conf": 0.8560405373573303,
"adv/std_reasoning": 0.7207762598991394,
"adv/std_step_conf": 0.93504798412323,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7486601307189543,
"calib/avg_num_step_conf": 4.359375,
"calib/ece": 0.1561904761904762,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.5158730158730159,
"calib/gap": 0.3863372549019608,
"calib/mean_conf": 0.6734920634920636,
"calib/mu_c": 0.8298666666666666,
"calib/mu_w": 0.44352941176470584,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11722222222222226,
"calib/std_conf": 0.37891934364008245,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5090322580645161,
"calib/step_q_c_n": 651.0,
"calib/step_q_gap": 0.169741935483871,
"calib/step_q_w": 0.3392903225806451,
"calib/step_q_w_n": 465.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2194.0,
"completions/max_terminated_length": 2194.0,
"completions/mean_length": 466.03515625,
"completions/mean_terminated_length": 466.03515625,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.03814903274178505,
"kl": 0.067413330078125,
"learning_rate": 2.861111111111111e-06,
"loss": 0.0792,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.036202527582645416,
"mask/share_reasoning": 0.8588676452636719,
"mask/share_step_conf": 0.10492978990077972,
"num_tokens": 22914211.0,
"reward": 0.9657089710235596,
"reward_std": 0.19180062413215637,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7771941423416138,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8417237401008606,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.6895825862884521,
"adv/mean_abs_reasoning": 0.4966968595981598,
"adv/mean_abs_step_conf": 0.7532524466514587,
"adv/ratio_final_to_reasoning": 1.3883369160947419,
"adv/ratio_step_to_reasoning": 1.516523473212331,
"adv/std_final_conf": 0.8690649271011353,
"adv/std_reasoning": 0.7576682567596436,
"adv/std_step_conf": 0.9344282746315002,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7004211080405316,
"calib/avg_num_step_conf": 3.98046875,
"calib/ece": 0.252788844621514,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6254980079681275,
"calib/gap": 0.2498420844848006,
"calib/mean_conf": 0.7424302788844622,
"calib/mu_c": 0.8439597315436241,
"calib/mu_w": 0.5941176470588235,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.20079681274900407,
"calib/std_conf": 0.3647987904168045,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5504595588235294,
"calib/step_q_c_n": 544.0,
"calib/step_q_gap": 0.1363332430340557,
"calib/step_q_w": 0.4141263157894737,
"calib/step_q_w_n": 475.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2112.0,
"completions/max_terminated_length": 2112.0,
"completions/mean_length": 484.59765625,
"completions/mean_terminated_length": 484.59765625,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.1584102064371109,
"kl": 0.27259063720703125,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.0204,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037299685180187225,
"mask/share_reasoning": 0.8697381615638733,
"mask/share_step_conf": 0.09296215325593948,
"num_tokens": 23144452.0,
"reward": 0.9168623685836792,
"reward_std": 0.21771375834941864,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7018964886665344,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8216720223426819,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.7243151664733887,
"adv/mean_abs_reasoning": 0.5069431066513062,
"adv/mean_abs_step_conf": 0.7548806071281433,
"adv/ratio_final_to_reasoning": 1.4287898522932256,
"adv/ratio_step_to_reasoning": 1.4890834833806657,
"adv/std_final_conf": 0.8931503295898438,
"adv/std_reasoning": 0.7577471137046814,
"adv/std_step_conf": 0.9351648092269897,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7371172516803585,
"calib/avg_num_step_conf": 4.03125,
"calib/ece": 0.1979268292682927,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.3821138211382114,
"calib/gap": 0.3625969176454613,
"calib/mean_conf": 0.5547560975609757,
"calib/mu_c": 0.7655339805825243,
"calib/mu_w": 0.402937062937063,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.1669918699186992,
"calib/std_conf": 0.40141789026576946,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5085817307692307,
"calib/step_q_c_n": 416.0,
"calib/step_q_gap": 0.12142263986013985,
"calib/step_q_w": 0.3871590909090909,
"calib/step_q_w_n": 616.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1862.0,
"completions/max_terminated_length": 1862.0,
"completions/mean_length": 544.375,
"completions/mean_terminated_length": 546.5098266601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.1056,
"grad_norm": 0.04278083145618439,
"kl": 0.06156158447265625,
"learning_rate": 2.805555555555556e-06,
"loss": -0.0756,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03205645829439163,
"mask/share_reasoning": 0.8791579604148865,
"mask/share_step_conf": 0.08487935364246368,
"num_tokens": 23389612.0,
"reward": 0.89930260181427,
"reward_std": 0.2116352617740631,
"rewards/accuracy_reward_step": 0.40234375,
"rewards/final_brier_reward_step": 0.715812087059021,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.812480628490448,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.7242751717567444,
"adv/mean_abs_reasoning": 0.4599572420120239,
"adv/mean_abs_step_conf": 0.7632794380187988,
"adv/ratio_final_to_reasoning": 1.574657610756373,
"adv/ratio_step_to_reasoning": 1.659457376255086,
"adv/std_final_conf": 0.9085102677345276,
"adv/std_reasoning": 0.7393200397491455,
"adv/std_step_conf": 0.934822142124176,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.8057424396873938,
"calib/avg_num_step_conf": 4.38671875,
"calib/ece": 0.192377049180328,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5245901639344263,
"calib/gap": 0.4464532789670404,
"calib/mean_conf": 0.661967213114754,
"calib/mu_c": 0.8614074074074074,
"calib/mu_w": 0.414954128440367,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.15053278688524602,
"calib/std_conf": 0.404920836918677,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5353023255813953,
"calib/step_q_c_n": 645.0,
"calib/step_q_gap": 0.1520178067529434,
"calib/step_q_w": 0.38328451882845194,
"calib/step_q_w_n": 478.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2316.0,
"completions/max_terminated_length": 2316.0,
"completions/mean_length": 561.5078125,
"completions/mean_terminated_length": 561.5078125,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.033582814037799835,
"kl": 0.05889892578125,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0711,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03251664340496063,
"mask/share_reasoning": 0.8713377714157104,
"mask/share_step_conf": 0.09614555537700653,
"num_tokens": 23640766.0,
"reward": 0.9381765127182007,
"reward_std": 0.2331477701663971,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7564589977264404,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8238001465797424,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.7356261014938354,
"adv/mean_abs_reasoning": 0.5457586050033569,
"adv/mean_abs_step_conf": 0.7550551295280457,
"adv/ratio_final_to_reasoning": 1.3478964779479943,
"adv/ratio_step_to_reasoning": 1.383496517702,
"adv/std_final_conf": 0.9189450740814209,
"adv/std_reasoning": 0.7578188180923462,
"adv/std_step_conf": 0.9353813529014587,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6785857104772062,
"calib/avg_num_step_conf": 4.453125,
"calib/ece": 0.25179591836734694,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.4,
"calib/gap": 0.222920554518795,
"calib/mean_conf": 0.5887755102040816,
"calib/mu_c": 0.6988709677419355,
"calib/mu_w": 0.47595041322314047,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.16722448979591834,
"calib/std_conf": 0.3875734698122289,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4499836333878886,
"calib/step_q_c_n": 611.0,
"calib/step_q_gap": 0.013632026582595602,
"calib/step_q_w": 0.436351606805293,
"calib/step_q_w_n": 529.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2153.0,
"completions/max_terminated_length": 2153.0,
"completions/mean_length": 535.72265625,
"completions/mean_terminated_length": 535.72265625,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.058739546686410904,
"kl": 0.0675811767578125,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0847,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03145679086446762,
"mask/share_reasoning": 0.8738486766815186,
"mask/share_step_conf": 0.09469453990459442,
"num_tokens": 23884903.0,
"reward": 0.8747799396514893,
"reward_std": 0.2343277633190155,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6691246032714844,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.793716549873352,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.6112939119338989,
"adv/mean_abs_reasoning": 0.3416619896888733,
"adv/mean_abs_step_conf": 0.7474625706672668,
"adv/ratio_final_to_reasoning": 1.7891774045177218,
"adv/ratio_step_to_reasoning": 2.187725275931123,
"adv/std_final_conf": 0.8120249509811401,
"adv/std_reasoning": 0.6186192035675049,
"adv/std_step_conf": 0.9343135356903076,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8078691875319367,
"calib/avg_num_step_conf": 4.4140625,
"calib/ece": 0.16984313725490194,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5843137254901961,
"calib/gap": 0.3900453500255494,
"calib/mean_conf": 0.7272549019607844,
"calib/mu_c": 0.8848026315789475,
"calib/mu_w": 0.49475728155339804,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1505098039215686,
"calib/std_conf": 0.36354614255976975,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.582176,
"calib/step_q_c_n": 625.0,
"calib/step_q_gap": 0.13015619801980205,
"calib/step_q_w": 0.452019801980198,
"calib/step_q_w_n": 505.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1918.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 444.75390625,
"completions/mean_terminated_length": 444.75390625,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.1088,
"grad_norm": 0.03678474575281143,
"kl": 0.07524871826171875,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0463,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03831843286752701,
"mask/share_reasoning": 0.8526753187179565,
"mask/share_step_conf": 0.10900621861219406,
"num_tokens": 24105456.0,
"reward": 0.9682535529136658,
"reward_std": 0.16872161626815796,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7835359573364258,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8381274342536926,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.6591412425041199,
"adv/mean_abs_reasoning": 0.47897109389305115,
"adv/mean_abs_step_conf": 0.7331361174583435,
"adv/ratio_final_to_reasoning": 1.376160797401479,
"adv/ratio_step_to_reasoning": 1.5306479384788187,
"adv/std_final_conf": 0.8299095034599304,
"adv/std_reasoning": 0.7393709421157837,
"adv/std_step_conf": 0.9341103434562683,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8233881578947366,
"calib/avg_num_step_conf": 4.4375,
"calib/ece": 0.18261507936507945,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5753968253968254,
"calib/gap": 0.37095657894736844,
"calib/mean_conf": 0.7305515873015873,
"calib/mu_c": 0.8777565789473685,
"calib/mu_w": 0.5068,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.15499603174603183,
"calib/std_conf": 0.3600417983305542,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.578041074249605,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.1569675156014937,
"calib/step_q_w": 0.42107355864811136,
"calib/step_q_w_n": 503.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 557.08203125,
"completions/mean_terminated_length": 557.08203125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.039522334933280945,
"kl": 0.06630706787109375,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.0137,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032912421971559525,
"mask/share_reasoning": 0.8734282851219177,
"mask/share_step_conf": 0.09365926682949066,
"num_tokens": 24352621.0,
"reward": 0.9609041213989258,
"reward_std": 0.20713165402412415,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7670042514801025,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8430851101875305,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.6705017685890198,
"adv/mean_abs_reasoning": 0.47117331624031067,
"adv/mean_abs_step_conf": 0.7483392953872681,
"adv/ratio_final_to_reasoning": 1.4230469881852275,
"adv/ratio_step_to_reasoning": 1.588246340770273,
"adv/std_final_conf": 0.8786208033561707,
"adv/std_reasoning": 0.7391869425773621,
"adv/std_step_conf": 0.9346818327903748,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.7892615384615386,
"calib/avg_num_step_conf": 4.65625,
"calib/ece": 0.23494117647058824,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5333333333333333,
"calib/gap": 0.3396707692307692,
"calib/mean_conf": 0.6882745098039217,
"calib/mu_c": 0.86144,
"calib/mu_w": 0.5217692307692308,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21650980392156863,
"calib/std_conf": 0.37431055987321926,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5891935483870967,
"calib/step_q_c_n": 558.0,
"calib/step_q_gap": 0.16376768087921023,
"calib/step_q_w": 0.42542586750788647,
"calib/step_q_w_n": 634.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1115.0,
"completions/max_terminated_length": 1115.0,
"completions/mean_length": 491.43359375,
"completions/mean_terminated_length": 493.3608093261719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.04268510267138481,
"kl": 0.07636260986328125,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0347,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03305329009890556,
"mask/share_reasoning": 0.8618713617324829,
"mask/share_step_conf": 0.10116907954216003,
"num_tokens": 24585108.0,
"reward": 0.9279952049255371,
"reward_std": 0.17701026797294617,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7343195676803589,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8263583183288574,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.7147749662399292,
"adv/mean_abs_reasoning": 0.562514066696167,
"adv/mean_abs_step_conf": 0.7437756061553955,
"adv/ratio_final_to_reasoning": 1.2706792746322617,
"adv/ratio_step_to_reasoning": 1.3222346785456194,
"adv/std_final_conf": 0.8912851214408875,
"adv/std_reasoning": 0.7929922938346863,
"adv/std_step_conf": 0.9355881214141846,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.719147005444646,
"calib/avg_num_step_conf": 4.55859375,
"calib/ece": 0.25951807228915647,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5502008032128514,
"calib/gap": 0.28966554316826554,
"calib/mean_conf": 0.6885140562248996,
"calib/mu_c": 0.8234586466165414,
"calib/mu_w": 0.5337931034482759,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2069477911646585,
"calib/std_conf": 0.39121759370677045,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5546408839779005,
"calib/step_q_c_n": 543.0,
"calib/step_q_gap": 0.13257357628559274,
"calib/step_q_w": 0.4220673076923077,
"calib/step_q_w_n": 624.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1825.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 523.828125,
"completions/mean_terminated_length": 525.8823852539062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.112,
"grad_norm": 0.028385218232870102,
"kl": 0.07004547119140625,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.0132,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03344562277197838,
"mask/share_reasoning": 0.8649731278419495,
"mask/share_step_conf": 0.09767502546310425,
"num_tokens": 24824968.0,
"reward": 0.9044659733772278,
"reward_std": 0.2518778443336487,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6948882341384888,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8163872957229614,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6365981101989746,
"adv/mean_abs_reasoning": 0.4709445834159851,
"adv/mean_abs_step_conf": 0.7567065954208374,
"adv/ratio_final_to_reasoning": 1.3517473873070704,
"adv/ratio_step_to_reasoning": 1.606784793939203,
"adv/std_final_conf": 0.8377167582511902,
"adv/std_reasoning": 0.7574999928474426,
"adv/std_step_conf": 0.9346597194671631,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7537356321839083,
"calib/avg_num_step_conf": 4.48046875,
"calib/ece": 0.3117928286852589,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.7689243027888446,
"calib/gap": 0.2719022988505747,
"calib/mean_conf": 0.8416733067729083,
"calib/mu_c": 0.9673333333333334,
"calib/mu_w": 0.6954310344827587,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.307808764940239,
"calib/std_conf": 0.3089456250663391,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6424213836477988,
"calib/step_q_c_n": 636.0,
"calib/step_q_gap": 0.1433998572290121,
"calib/step_q_w": 0.49902152641878667,
"calib/step_q_w_n": 511.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2232.0,
"completions/max_terminated_length": 2232.0,
"completions/mean_length": 493.40234375,
"completions/mean_terminated_length": 493.40234375,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.046452466398477554,
"kl": 0.0757904052734375,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.0227,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.033236440271139145,
"mask/share_reasoning": 0.8643213510513306,
"mask/share_step_conf": 0.10244220495223999,
"num_tokens": 25055863.0,
"reward": 0.8876609206199646,
"reward_std": 0.22679907083511353,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6773473024368286,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7987557649612427,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.6299135684967041,
"adv/mean_abs_reasoning": 0.5634945034980774,
"adv/mean_abs_step_conf": 0.726954996585846,
"adv/ratio_final_to_reasoning": 1.117869942983132,
"adv/ratio_step_to_reasoning": 1.2900835626133598,
"adv/std_final_conf": 0.8326076865196228,
"adv/std_reasoning": 0.8264657258987427,
"adv/std_step_conf": 0.9348495602607727,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6389862409138111,
"calib/avg_num_step_conf": 4.92578125,
"calib/ece": 0.3347410358565736,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8167330677290837,
"calib/gap": 0.15670430944963665,
"calib/mean_conf": 0.8908366533864542,
"calib/mu_c": 0.9576388888888889,
"calib/mu_w": 0.8009345794392523,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.32593625498007955,
"calib/std_conf": 0.25053507919555956,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.6303727714748785,
"calib/step_q_c_n": 617.0,
"calib/step_q_gap": 0.15694109445624488,
"calib/step_q_w": 0.4734316770186336,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2065.0,
"completions/max_terminated_length": 2065.0,
"completions/mean_length": 486.8671875,
"completions/mean_terminated_length": 486.8671875,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.11413333333333334,
"grad_norm": 212991.3125,
"kl": 659456.0815200806,
"learning_rate": 2.5833333333333337e-06,
"loss": 14848.5596,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034113090485334396,
"mask/share_reasoning": 0.855882465839386,
"mask/share_step_conf": 0.11000443249940872,
"num_tokens": 25285117.0,
"reward": 0.8787997961044312,
"reward_std": 0.23736529052257538,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6400562524795532,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8120745420455933,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.6507050395011902,
"adv/mean_abs_reasoning": 0.5196617245674133,
"adv/mean_abs_step_conf": 0.7388439178466797,
"adv/ratio_final_to_reasoning": 1.2521704192142733,
"adv/ratio_step_to_reasoning": 1.4217785973398793,
"adv/std_final_conf": 0.8490867018699646,
"adv/std_reasoning": 0.7753031849861145,
"adv/std_step_conf": 0.9350075125694275,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6293950719822813,
"calib/avg_num_step_conf": 4.96875,
"calib/ece": 0.29661417322834643,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.7992125984251969,
"calib/gap": 0.09142995570321144,
"calib/mean_conf": 0.8880314960629921,
"calib/mu_c": 0.9189880952380953,
"calib/mu_w": 0.8275581395348839,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.26161417322834646,
"calib/std_conf": 0.2564599686957617,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.644311814859927,
"calib/step_q_c_n": 821.0,
"calib/step_q_gap": 0.0921388658577097,
"calib/step_q_w": 0.5521729490022173,
"calib/step_q_w_n": 451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1779.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 509.99609375,
"completions/mean_terminated_length": 511.99609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.1152,
"grad_norm": 0.03606853261590004,
"kl": 0.08648681640625,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.0144,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.033813901245594025,
"mask/share_reasoning": 0.8542295694351196,
"mask/share_step_conf": 0.10805031657218933,
"num_tokens": 25518908.0,
"reward": 0.9265960454940796,
"reward_std": 0.22009024024009705,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.6905062794685364,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8329982757568359,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.637611448764801,
"adv/mean_abs_reasoning": 0.47797125577926636,
"adv/mean_abs_step_conf": 0.7113043069839478,
"adv/ratio_final_to_reasoning": 1.333995383729223,
"adv/ratio_step_to_reasoning": 1.488173814603692,
"adv/std_final_conf": 0.8487848043441772,
"adv/std_reasoning": 0.7752171754837036,
"adv/std_step_conf": 0.9350650906562805,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8020752016653656,
"calib/avg_num_step_conf": 5.02734375,
"calib/ece": 0.263991935483871,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.6451612903225806,
"calib/gap": 0.41911657559198556,
"calib/mean_conf": 0.7417338709677419,
"calib/mu_c": 0.954672131147541,
"calib/mu_w": 0.5355555555555555,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.25689516129032264,
"calib/std_conf": 0.3758589862733562,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6456351791530945,
"calib/step_q_c_n": 614.0,
"calib/step_q_gap": 0.1917124451263486,
"calib/step_q_w": 0.45392273402674593,
"calib/step_q_w_n": 673.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2007.0,
"completions/max_terminated_length": 2007.0,
"completions/mean_length": 518.18359375,
"completions/mean_terminated_length": 518.18359375,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.03923717141151428,
"kl": 0.0962982177734375,
"learning_rate": 2.5277777777777778e-06,
"loss": 0.0092,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.031188976019620895,
"mask/share_reasoning": 0.8646241426467896,
"mask/share_step_conf": 0.10418689250946045,
"num_tokens": 25756163.0,
"reward": 0.925527811050415,
"reward_std": 0.2139080911874771,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7322777509689331,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8297152519226074,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.7360544204711914,
"adv/mean_abs_reasoning": 0.5907687544822693,
"adv/mean_abs_step_conf": 0.752666175365448,
"adv/ratio_final_to_reasoning": 1.2459264558029068,
"adv/ratio_step_to_reasoning": 1.274045334413565,
"adv/std_final_conf": 0.9061128497123718,
"adv/std_reasoning": 0.8266500234603882,
"adv/std_step_conf": 0.9360246062278748,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6578307712521352,
"calib/avg_num_step_conf": 3.92578125,
"calib/ece": 0.35874493927125506,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.7044534412955465,
"calib/gap": 0.14679477072657987,
"calib/mean_conf": 0.8360728744939271,
"calib/mu_c": 0.906201550387597,
"calib/mu_w": 0.7594067796610171,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3362753036437247,
"calib/std_conf": 0.2872816240754243,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.6610315789473684,
"calib/step_q_c_n": 475.0,
"calib/step_q_gap": 0.10944667328699098,
"calib/step_q_w": 0.5515849056603774,
"calib/step_q_w_n": 530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1401.0,
"completions/max_terminated_length": 1401.0,
"completions/mean_length": 460.0546875,
"completions/mean_terminated_length": 460.0546875,
"completions/min_length": 82.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.032077256590127945,
"kl": 0.1114044189453125,
"learning_rate": 2.5e-06,
"loss": -0.1208,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.0361558198928833,
"mask/share_reasoning": 0.867607057094574,
"mask/share_step_conf": 0.09623715281486511,
"num_tokens": 25978857.0,
"reward": 0.816313624382019,
"reward_std": 0.2857520282268524,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6051039099693298,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7368983030319214,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.725448727607727,
"adv/mean_abs_reasoning": 0.6362229585647583,
"adv/mean_abs_step_conf": 0.7552950382232666,
"adv/ratio_final_to_reasoning": 1.1402429255999362,
"adv/ratio_step_to_reasoning": 1.1871546413966583,
"adv/std_final_conf": 0.9021727442741394,
"adv/std_reasoning": 0.8748306632041931,
"adv/std_step_conf": 0.9352055191993713,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.6703575782202357,
"calib/avg_num_step_conf": 3.89453125,
"calib/ece": 0.28967346938775507,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.7428571428571429,
"calib/gap": 0.2398273059731817,
"calib/mean_conf": 0.8206938775510204,
"calib/mu_c": 0.9254347826086956,
"calib/mu_w": 0.6856074766355139,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.2735510204081632,
"calib/std_conf": 0.3240142478652111,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.6638683127572016,
"calib/step_q_c_n": 486.0,
"calib/step_q_gap": 0.13273328340299417,
"calib/step_q_w": 0.5311350293542074,
"calib/step_q_w_n": 511.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2159.0,
"completions/max_terminated_length": 2159.0,
"completions/mean_length": 498.30078125,
"completions/mean_terminated_length": 498.30078125,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.1184,
"grad_norm": 0.04491841420531273,
"kl": 0.11196136474609375,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.0422,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.0340171717107296,
"mask/share_reasoning": 0.8737939596176147,
"mask/share_step_conf": 0.09218887984752655,
"num_tokens": 26213830.0,
"reward": 0.8551008105278015,
"reward_std": 0.31432783603668213,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6475656032562256,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7696672081947327,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.7325797080993652,
"adv/mean_abs_reasoning": 0.6655578017234802,
"adv/mean_abs_step_conf": 0.7522399425506592,
"adv/ratio_final_to_reasoning": 1.1007003542026401,
"adv/ratio_step_to_reasoning": 1.1302398388279924,
"adv/std_final_conf": 0.8928104639053345,
"adv/std_reasoning": 0.874744176864624,
"adv/std_step_conf": 0.9354557394981384,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7793906196508046,
"calib/avg_num_step_conf": 3.77734375,
"calib/ece": 0.2210330578512397,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.5206611570247934,
"calib/gap": 0.3675172885997943,
"calib/mean_conf": 0.6674793388429752,
"calib/mu_c": 0.8421259842519683,
"calib/mu_w": 0.47460869565217395,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.18185950413223143,
"calib/std_conf": 0.39885042659655895,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.6290947368421054,
"calib/step_q_c_n": 475.0,
"calib/step_q_gap": 0.17698091570389407,
"calib/step_q_w": 0.4521138211382113,
"calib/step_q_w_n": 492.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2190.0,
"completions/max_terminated_length": 2190.0,
"completions/mean_length": 548.1953125,
"completions/mean_terminated_length": 552.5117797851562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.02175419218838215,
"kl": 0.1053009033203125,
"learning_rate": 2.4444444444444447e-06,
"loss": -0.085,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.029369987547397614,
"mask/share_reasoning": 0.8855926990509033,
"mask/share_step_conf": 0.07722484320402145,
"num_tokens": 26462088.0,
"reward": 0.8555378317832947,
"reward_std": 0.2852292060852051,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.685128927230835,
"rewards/format_reward_step": 0.9140625,
"rewards/step_l2_reward": 0.7431342005729675,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.7447386384010315,
"adv/mean_abs_reasoning": 0.5864541530609131,
"adv/mean_abs_step_conf": 0.7527590394020081,
"adv/ratio_final_to_reasoning": 1.2699008686595112,
"adv/ratio_step_to_reasoning": 1.2835769607446559,
"adv/std_final_conf": 0.911855161190033,
"adv/std_reasoning": 0.8265925049781799,
"adv/std_step_conf": 0.9350982904434204,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.749966078697422,
"calib/avg_num_step_conf": 4.12890625,
"calib/ece": 0.23784153005464478,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5778688524590164,
"calib/gap": 0.2594970601537766,
"calib/mean_conf": 0.7608743169398906,
"calib/mu_c": 0.8778606965174128,
"calib/mu_w": 0.6183636363636362,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.2247677595628415,
"calib/std_conf": 0.322982121031701,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.6289224393132031,
"calib/step_q_c_n": 563.0,
"calib/step_q_gap": 0.14236373485976184,
"calib/step_q_w": 0.4865587044534413,
"calib/step_q_w_n": 494.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2107.0,
"completions/max_terminated_length": 2107.0,
"completions/mean_length": 461.3515625,
"completions/mean_terminated_length": 461.3515625,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.02832408808171749,
"kl": 0.132415771484375,
"learning_rate": 2.4166666666666667e-06,
"loss": -0.0059,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03460315614938736,
"mask/share_reasoning": 0.8678891062736511,
"mask/share_step_conf": 0.0975077673792839,
"num_tokens": 26685394.0,
"reward": 0.8980043530464172,
"reward_std": 0.2618659436702728,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6935716867446899,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8071244955062866,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.6925190091133118,
"adv/mean_abs_reasoning": 0.548272967338562,
"adv/mean_abs_step_conf": 0.7386399507522583,
"adv/ratio_final_to_reasoning": 1.263091653916391,
"adv/ratio_step_to_reasoning": 1.3472120544950075,
"adv/std_final_conf": 0.8970476984977722,
"adv/std_reasoning": 0.7929205298423767,
"adv/std_step_conf": 0.9356428384780884,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7543706293706295,
"calib/avg_num_step_conf": 4.05859375,
"calib/ece": 0.23186234817813772,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.6720647773279352,
"calib/gap": 0.34546328671328674,
"calib/mean_conf": 0.7889473684210526,
"calib/mu_c": 0.9344055944055945,
"calib/mu_w": 0.5889423076923077,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.22093117408906887,
"calib/std_conf": 0.33682205152856126,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.6588998357963876,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.14983006835452717,
"calib/step_q_w": 0.5090697674418604,
"calib/step_q_w_n": 430.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1543.0,
"completions/max_terminated_length": 1543.0,
"completions/mean_length": 474.8203125,
"completions/mean_terminated_length": 474.8203125,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.1216,
"grad_norm": 0.03796344995498657,
"kl": 0.1222076416015625,
"learning_rate": 2.388888888888889e-06,
"loss": -0.075,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.035458482801914215,
"mask/share_reasoning": 0.8631207346916199,
"mask/share_step_conf": 0.1014208048582077,
"num_tokens": 26911972.0,
"reward": 0.9028903245925903,
"reward_std": 0.2743861675262451,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7213600873947144,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7859828472137451,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.777995228767395,
"adv/mean_abs_reasoning": 0.5828838348388672,
"adv/mean_abs_step_conf": 0.742296040058136,
"adv/ratio_final_to_reasoning": 1.3347346113698015,
"adv/ratio_step_to_reasoning": 1.2734888080458378,
"adv/std_final_conf": 0.921405017375946,
"adv/std_reasoning": 0.8100579977035522,
"adv/std_step_conf": 0.9360432624816895,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6451282051282051,
"calib/avg_num_step_conf": 3.8359375,
"calib/ece": 0.34812,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.616,
"calib/gap": 0.1600448717948718,
"calib/mean_conf": 0.7616400000000001,
"calib/mu_c": 0.8384615384615385,
"calib/mu_w": 0.6784166666666667,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.29488000000000003,
"calib/std_conf": 0.3515487311881526,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.6039960238568589,
"calib/step_q_c_n": 503.0,
"calib/step_q_gap": 0.05746157709276711,
"calib/step_q_w": 0.5465344467640918,
"calib/step_q_w_n": 479.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1274.0,
"completions/max_terminated_length": 1274.0,
"completions/mean_length": 463.61328125,
"completions/mean_terminated_length": 465.431396484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.034675490111112595,
"kl": 0.124359130859375,
"learning_rate": 2.361111111111111e-06,
"loss": -0.1689,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03430056944489479,
"mask/share_reasoning": 0.8711838722229004,
"mask/share_step_conf": 0.09060931950807571,
"num_tokens": 27135921.0,
"reward": 0.8092306852340698,
"reward_std": 0.3144776523113251,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.5996066331863403,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7313545942306519,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.7690123319625854,
"adv/mean_abs_reasoning": 0.6008901000022888,
"adv/mean_abs_step_conf": 0.7869776487350464,
"adv/ratio_final_to_reasoning": 1.2797886534653446,
"adv/ratio_step_to_reasoning": 1.3096864946386182,
"adv/std_final_conf": 0.922232985496521,
"adv/std_reasoning": 0.8430724143981934,
"adv/std_step_conf": 0.9357346892356873,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7209302325581395,
"calib/avg_num_step_conf": 3.8828125,
"calib/ece": 0.28441250000000007,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.5125,
"calib/gap": 0.25619589356798655,
"calib/mean_conf": 0.7060875000000001,
"calib/mu_c": 0.8437927927927928,
"calib/mu_w": 0.5875968992248063,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.264,
"calib/std_conf": 0.35327000510999607,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.6702894736842107,
"calib/step_q_c_n": 380.0,
"calib/step_q_gap": 0.15883996228355923,
"calib/step_q_w": 0.5114495114006514,
"calib/step_q_w_n": 614.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3003.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 531.0859375,
"completions/mean_terminated_length": 533.1686401367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.06119803339242935,
"kl": 0.111297607421875,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0462,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.03346026688814163,
"mask/share_reasoning": 0.8729349374771118,
"mask/share_step_conf": 0.08969855308532715,
"num_tokens": 27376399.0,
"reward": 0.823566198348999,
"reward_std": 0.31112343072891235,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6282831430435181,
"rewards/format_reward_step": 0.90625,
"rewards/step_l2_reward": 0.7500991821289062,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.7061994075775146,
"adv/mean_abs_reasoning": 0.5571328401565552,
"adv/mean_abs_step_conf": 0.7145150303840637,
"adv/ratio_final_to_reasoning": 1.2675601879420204,
"adv/ratio_step_to_reasoning": 1.282485933127338,
"adv/std_final_conf": 0.8947641849517822,
"adv/std_reasoning": 0.7929291129112244,
"adv/std_step_conf": 0.9358139038085938,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7025375939849624,
"calib/avg_num_step_conf": 3.85546875,
"calib/ece": 0.256,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.46938775510204084,
"calib/gap": 0.28945018796992494,
"calib/mean_conf": 0.6979591836734694,
"calib/mu_c": 0.8550892857142858,
"calib/mu_w": 0.5656390977443608,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.24840816326530615,
"calib/std_conf": 0.34754873152265575,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.6135507246376812,
"calib/step_q_c_n": 414.0,
"calib/step_q_gap": 0.09999051521359748,
"calib/step_q_w": 0.5135602094240838,
"calib/step_q_w_n": 573.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1972.0,
"completions/max_terminated_length": 1972.0,
"completions/mean_length": 478.453125,
"completions/mean_terminated_length": 478.453125,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.1248,
"grad_norm": 0.05510564520955086,
"kl": 0.1254730224609375,
"learning_rate": 2.305555555555556e-06,
"loss": -0.0354,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03364469110965729,
"mask/share_reasoning": 0.8776097893714905,
"mask/share_step_conf": 0.08874553442001343,
"num_tokens": 27605483.0,
"reward": 0.8502526879310608,
"reward_std": 0.25346314907073975,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.664110541343689,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7621760368347168,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.7624117732048035,
"adv/mean_abs_reasoning": 0.5823904871940613,
"adv/mean_abs_step_conf": 0.7789553999900818,
"adv/ratio_final_to_reasoning": 1.3091075317491516,
"adv/ratio_step_to_reasoning": 1.337513948318531,
"adv/std_final_conf": 0.9246702194213867,
"adv/std_reasoning": 0.8266401290893555,
"adv/std_step_conf": 0.9355471730232239,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7529248875043268,
"calib/avg_num_step_conf": 4.453125,
"calib/ece": 0.18322314049586783,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.4380165289256198,
"calib/gap": 0.3315167878158532,
"calib/mean_conf": 0.6451239669421488,
"calib/mu_c": 0.7917037037037037,
"calib/mu_w": 0.4601869158878505,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.13524793388429757,
"calib/std_conf": 0.37549001748014366,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5426194398682043,
"calib/step_q_c_n": 607.0,
"calib/step_q_gap": 0.11141868939916116,
"calib/step_q_w": 0.4312007504690431,
"calib/step_q_w_n": 533.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2711.0,
"completions/max_terminated_length": 2711.0,
"completions/mean_length": 522.21875,
"completions/mean_terminated_length": 522.21875,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.027424253523349762,
"kl": 0.1253204345703125,
"learning_rate": 2.277777777777778e-06,
"loss": -0.0927,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.032755836844444275,
"mask/share_reasoning": 0.8705406785011292,
"mask/share_step_conf": 0.09670349210500717,
"num_tokens": 27843179.0,
"reward": 0.8873322010040283,
"reward_std": 0.26377955079078674,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7093136310577393,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7731631994247437,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.7559334635734558,
"adv/mean_abs_reasoning": 0.683125376701355,
"adv/mean_abs_step_conf": 0.7430295944213867,
"adv/ratio_final_to_reasoning": 1.1065808552211502,
"adv/ratio_step_to_reasoning": 1.0876913956985386,
"adv/std_final_conf": 0.9073584079742432,
"adv/std_reasoning": 0.8749279379844666,
"adv/std_step_conf": 0.936138391494751,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.7755255255255257,
"calib/avg_num_step_conf": 3.39453125,
"calib/ece": 0.19614718614718613,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.354978354978355,
"calib/gap": 0.3422342342342343,
"calib/mean_conf": 0.5531168831168831,
"calib/mu_c": 0.730900900900901,
"calib/mu_w": 0.38866666666666666,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.90234375,
"calib/pce": 0.13437229437229434,
"calib/std_conf": 0.391311236963411,
"calib/step_conf_rate": 0.90234375,
"calib/step_q_c": 0.560694789081886,
"calib/step_q_c_n": 403.0,
"calib/step_q_gap": 0.1305231152621435,
"calib/step_q_w": 0.4301716738197425,
"calib/step_q_w_n": 466.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2184.0,
"completions/max_terminated_length": 2184.0,
"completions/mean_length": 526.33984375,
"completions/mean_terminated_length": 526.33984375,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.035477571189403534,
"kl": 0.1305389404296875,
"learning_rate": 2.25e-06,
"loss": -0.1697,
"mask/has_final_conf_rate": 0.90234375,
"mask/share_final_conf": 0.031598348170518875,
"mask/share_reasoning": 0.8906428217887878,
"mask/share_step_conf": 0.07775881886482239,
"num_tokens": 28082986.0,
"reward": 0.8124796748161316,
"reward_std": 0.3323761820793152,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.645050048828125,
"rewards/format_reward_step": 0.8515625,
"rewards/step_l2_reward": 0.7189717292785645,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.7713155746459961,
"adv/mean_abs_reasoning": 0.6857198476791382,
"adv/mean_abs_step_conf": 0.7484744787216187,
"adv/ratio_final_to_reasoning": 1.1248260893374489,
"adv/ratio_step_to_reasoning": 1.0915164279623486,
"adv/std_final_conf": 0.9290948510169983,
"adv/std_reasoning": 0.8750848770141602,
"adv/std_step_conf": 0.9358519315719604,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.7467312539382482,
"calib/avg_num_step_conf": 3.28515625,
"calib/ece": 0.18213043478260874,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.34347826086956523,
"calib/gap": 0.3489855072463767,
"calib/mean_conf": 0.5565652173913043,
"calib/mu_c": 0.696159420289855,
"calib/mu_w": 0.3471739130434783,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.0693478260869566,
"calib/std_conf": 0.388440508128835,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_c": 0.5378112449799196,
"calib/step_q_c_n": 498.0,
"calib/step_q_gap": 0.096907454892456,
"calib/step_q_w": 0.4409037900874636,
"calib/step_q_w_n": 343.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2358.0,
"completions/max_terminated_length": 2358.0,
"completions/mean_length": 455.015625,
"completions/mean_terminated_length": 456.8000183105469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.128,
"grad_norm": 0.0382017083466053,
"kl": 0.14501953125,
"learning_rate": 2.222222222222222e-06,
"loss": -0.2528,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.0328923761844635,
"mask/share_reasoning": 0.8787045478820801,
"mask/share_step_conf": 0.08449685573577881,
"num_tokens": 28306158.0,
"reward": 0.8309608697891235,
"reward_std": 0.33806800842285156,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6615766286849976,
"rewards/format_reward_step": 0.8515625,
"rewards/step_l2_reward": 0.7190951108932495,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.8134666681289673,
"adv/mean_abs_reasoning": 0.725548267364502,
"adv/mean_abs_step_conf": 0.7782011032104492,
"adv/ratio_final_to_reasoning": 1.121175123308918,
"adv/ratio_step_to_reasoning": 1.07256972170467,
"adv/std_final_conf": 0.9362523555755615,
"adv/std_reasoning": 0.8904957175254822,
"adv/std_step_conf": 0.9359607696533203,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6404618904618905,
"calib/avg_num_step_conf": 3.74609375,
"calib/ece": 0.23759493670886078,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.19831223628691982,
"calib/gap": 0.17561347061347055,
"calib/mean_conf": 0.4868776371308017,
"calib/mu_c": 0.569126984126984,
"calib/mu_w": 0.3935135135135135,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.09641350210970469,
"calib/std_conf": 0.35624441339302676,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.4792,
"calib/step_q_c_n": 494.0,
"calib/step_q_gap": 0.0198236559139785,
"calib/step_q_w": 0.4593763440860215,
"calib/step_q_w_n": 465.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2327.0,
"completions/max_terminated_length": 2327.0,
"completions/mean_length": 548.76171875,
"completions/mean_terminated_length": 553.0827026367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.025058355182409286,
"kl": 0.12908935546875,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.2633,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.029778383672237396,
"mask/share_reasoning": 0.8835940361022949,
"mask/share_step_conf": 0.0788150429725647,
"num_tokens": 28551697.0,
"reward": 0.8246980309486389,
"reward_std": 0.3075857162475586,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6279492378234863,
"rewards/format_reward_step": 0.890625,
"rewards/step_l2_reward": 0.7425405383110046,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.7894877791404724,
"adv/mean_abs_reasoning": 0.7466657161712646,
"adv/mean_abs_step_conf": 0.7526766657829285,
"adv/ratio_final_to_reasoning": 1.0573510501979249,
"adv/ratio_step_to_reasoning": 1.0080503891922166,
"adv/std_final_conf": 0.9356542825698853,
"adv/std_reasoning": 0.9207422733306885,
"adv/std_step_conf": 0.9358314871788025,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.782716049382716,
"calib/avg_num_step_conf": 3.5703125,
"calib/ece": 0.16097692307692313,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.8828125,
"calib/frac_conf_gt_0.9": 0.2948717948717949,
"calib/gap": 0.3536087542087542,
"calib/mean_conf": 0.563211111111111,
"calib/mu_c": 0.7128148148148148,
"calib/mu_w": 0.3592060606060606,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.07363247863247867,
"calib/std_conf": 0.3718917056499702,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.5741482965931863,
"calib/step_q_c_n": 499.0,
"calib/step_q_gap": 0.13718444117149964,
"calib/step_q_w": 0.4369638554216867,
"calib/step_q_w_n": 415.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2356.0,
"completions/max_terminated_length": 2356.0,
"completions/mean_length": 506.76171875,
"completions/mean_terminated_length": 506.76171875,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.030653269961476326,
"kl": 0.1387176513671875,
"learning_rate": 2.166666666666667e-06,
"loss": -0.1761,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.03297232836484909,
"mask/share_reasoning": 0.8866724371910095,
"mask/share_step_conf": 0.08035525679588318,
"num_tokens": 28788772.0,
"reward": 0.8681167364120483,
"reward_std": 0.32529065012931824,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.6960824728012085,
"rewards/format_reward_step": 0.8828125,
"rewards/step_l2_reward": 0.7573385238647461,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7737333178520203,
"adv/mean_abs_reasoning": 0.6747977137565613,
"adv/mean_abs_step_conf": 0.7366330623626709,
"adv/ratio_final_to_reasoning": 1.1466152034580703,
"adv/ratio_step_to_reasoning": 1.0916353854577776,
"adv/std_final_conf": 0.9265235066413879,
"adv/std_reasoning": 0.8904935121536255,
"adv/std_step_conf": 0.9361698627471924,
"calib/answer_extract_rate": 0.91015625,
"calib/auroc": 0.7301136363636364,
"calib/avg_num_step_conf": 3.66796875,
"calib/ece": 0.15086956521739134,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.19130434782608696,
"calib/gap": 0.28080303030303033,
"calib/mean_conf": 0.47313043478260874,
"calib/mu_c": 0.6196363636363637,
"calib/mu_w": 0.3388333333333333,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.07286956521739132,
"calib/std_conf": 0.34455734732474236,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_c": 0.5267430025445292,
"calib/step_q_c_n": 393.0,
"calib/step_q_gap": 0.11906900987053654,
"calib/step_q_w": 0.4076739926739927,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2523.0,
"completions/max_terminated_length": 2523.0,
"completions/mean_length": 550.48046875,
"completions/mean_terminated_length": 552.6392211914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1312,
"grad_norm": 0.034598927944898605,
"kl": 0.1342010498046875,
"learning_rate": 2.138888888888889e-06,
"loss": -0.3169,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.029137644916772842,
"mask/share_reasoning": 0.8917558193206787,
"mask/share_step_conf": 0.07520025223493576,
"num_tokens": 29034983.0,
"reward": 0.816436767578125,
"reward_std": 0.31139829754829407,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6586597561836243,
"rewards/format_reward_step": 0.8515625,
"rewards/step_l2_reward": 0.7171823978424072,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.777886152267456,
"adv/mean_abs_reasoning": 0.6544324159622192,
"adv/mean_abs_step_conf": 0.7390900254249573,
"adv/ratio_final_to_reasoning": 1.188642453054104,
"adv/ratio_step_to_reasoning": 1.1293603547102187,
"adv/std_final_conf": 0.9364475607872009,
"adv/std_reasoning": 0.8749727010726929,
"adv/std_step_conf": 0.9361278414726257,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.7035381610576923,
"calib/avg_num_step_conf": 3.421875,
"calib/ece": 0.19879310344827583,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.22844827586206898,
"calib/gap": 0.24277043269230764,
"calib/mean_conf": 0.495,
"calib/mu_c": 0.6038281249999999,
"calib/mu_w": 0.3610576923076923,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.8828125,
"calib/pce": 0.07103448275862068,
"calib/std_conf": 0.35718004732473024,
"calib/step_conf_rate": 0.8828125,
"calib/step_q_c": 0.5541610738255034,
"calib/step_q_c_n": 447.0,
"calib/step_q_gap": 0.09642214608657562,
"calib/step_q_w": 0.45773892773892777,
"calib/step_q_w_n": 429.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2212.0,
"completions/max_terminated_length": 2212.0,
"completions/mean_length": 500.01953125,
"completions/mean_terminated_length": 500.01953125,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.04210676625370979,
"kl": 0.1402130126953125,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.2335,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.03087177686393261,
"mask/share_reasoning": 0.8886597156524658,
"mask/share_step_conf": 0.08046852797269821,
"num_tokens": 29269804.0,
"reward": 0.80426025390625,
"reward_std": 0.3211674094200134,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6273288726806641,
"rewards/format_reward_step": 0.8515625,
"rewards/step_l2_reward": 0.7061915993690491,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.7988862991333008,
"adv/mean_abs_reasoning": 0.6417537927627563,
"adv/mean_abs_step_conf": 0.7917139530181885,
"adv/ratio_final_to_reasoning": 1.2448485823419717,
"adv/ratio_step_to_reasoning": 1.2336724175946856,
"adv/std_final_conf": 0.9355748295783997,
"adv/std_reasoning": 0.8432921767234802,
"adv/std_step_conf": 0.9359493851661682,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.6413651188072682,
"calib/avg_num_step_conf": 3.40625,
"calib/ece": 0.25601731601731603,
"calib/final_conf_rate": 0.90234375,
"calib/format_rate": 0.8828125,
"calib/frac_conf_gt_0.9": 0.22510822510822512,
"calib/gap": 0.16824662214629604,
"calib/mean_conf": 0.4827705627705628,
"calib/mu_c": 0.5825531914893618,
"calib/mu_w": 0.41430656934306576,
"calib/nonempty_final_conf_rate": 0.90234375,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.90625,
"calib/pce": 0.16593073593073593,
"calib/std_conf": 0.36907528555320096,
"calib/step_conf_rate": 0.90625,
"calib/step_q_c": 0.5112790697674419,
"calib/step_q_c_n": 344.0,
"calib/step_q_gap": 0.0478889182522903,
"calib/step_q_w": 0.46339015151515156,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 551.2265625,
"completions/mean_terminated_length": 551.2265625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.04054722189903259,
"kl": 0.340728759765625,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.2266,
"mask/has_final_conf_rate": 0.90234375,
"mask/share_final_conf": 0.031965482980012894,
"mask/share_reasoning": 0.8885842561721802,
"mask/share_step_conf": 0.07945021241903305,
"num_tokens": 29515726.0,
"reward": 0.7934260964393616,
"reward_std": 0.30349084734916687,
"rewards/accuracy_reward_step": 0.375,
"rewards/final_brier_reward_step": 0.6105921864509583,
"rewards/format_reward_step": 0.8828125,
"rewards/step_l2_reward": 0.724697470664978,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.7436902523040771,
"adv/mean_abs_reasoning": 0.5532766580581665,
"adv/mean_abs_step_conf": 0.8003737926483154,
"adv/ratio_final_to_reasoning": 1.3441562037231152,
"adv/ratio_step_to_reasoning": 1.4466068304008795,
"adv/std_final_conf": 0.8976423144340515,
"adv/std_reasoning": 0.7931344509124756,
"adv/std_step_conf": 0.934795618057251,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7650679117147707,
"calib/avg_num_step_conf": 3.75390625,
"calib/ece": 0.19609243697478987,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.36134453781512604,
"calib/gap": 0.36420486700622534,
"calib/mean_conf": 0.5386134453781513,
"calib/mu_c": 0.7130645161290323,
"calib/mu_w": 0.348859649122807,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.10684873949579826,
"calib/std_conf": 0.4026552753207035,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.5576223776223777,
"calib/step_q_c_n": 429.0,
"calib/step_q_gap": 0.15790433250959574,
"calib/step_q_w": 0.3997180451127819,
"calib/step_q_w_n": 532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2340.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 532.91015625,
"completions/mean_terminated_length": 532.91015625,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.1344,
"grad_norm": 0.0251851137727499,
"kl": 0.1164093017578125,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0353,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.033219676464796066,
"mask/share_reasoning": 0.8798425197601318,
"mask/share_step_conf": 0.08693777024745941,
"num_tokens": 29757615.0,
"reward": 0.8525570034980774,
"reward_std": 0.25820255279541016,
"rewards/accuracy_reward_step": 0.484375,
"rewards/final_brier_reward_step": 0.6813836097717285,
"rewards/format_reward_step": 0.890625,
"rewards/step_l2_reward": 0.7487304210662842,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.7592334151268005,
"adv/mean_abs_reasoning": 0.6342440843582153,
"adv/mean_abs_step_conf": 0.7591673135757446,
"adv/ratio_final_to_reasoning": 1.1970681853423364,
"adv/ratio_step_to_reasoning": 1.1969639643449537,
"adv/std_final_conf": 0.9183486700057983,
"adv/std_reasoning": 0.8749744296073914,
"adv/std_step_conf": 0.9358720779418945,
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.7597771546635183,
"calib/avg_num_step_conf": 3.70703125,
"calib/ece": 0.18613733905579397,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.24034334763948498,
"calib/gap": 0.33792355371900823,
"calib/mean_conf": 0.4932618025751073,
"calib/mu_c": 0.66875,
"calib/mu_w": 0.3308264462809917,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 0.9453125,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.09935622317596567,
"calib/std_conf": 0.3843757607482917,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.5411187214611872,
"calib/step_q_c_n": 438.0,
"calib/step_q_gap": 0.13560013046314423,
"calib/step_q_w": 0.405518590998043,
"calib/step_q_w_n": 511.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2380.0,
"completions/max_terminated_length": 2380.0,
"completions/mean_length": 515.1875,
"completions/mean_terminated_length": 517.2078857421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.028686698526144028,
"kl": 0.1333465576171875,
"learning_rate": 2.027777777777778e-06,
"loss": -0.3105,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.033604905009269714,
"mask/share_reasoning": 0.8754376173019409,
"mask/share_step_conf": 0.0870511457324028,
"num_tokens": 29993175.0,
"reward": 0.8517680764198303,
"reward_std": 0.29256191849708557,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6856671571731567,
"rewards/format_reward_step": 0.88671875,
"rewards/step_l2_reward": 0.7530252933502197,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.780434787273407,
"adv/mean_abs_reasoning": 0.685494065284729,
"adv/mean_abs_step_conf": 0.7519875764846802,
"adv/ratio_final_to_reasoning": 1.1384996994091308,
"adv/ratio_step_to_reasoning": 1.0970008561231412,
"adv/std_final_conf": 0.9359096884727478,
"adv/std_reasoning": 0.8904690146446228,
"adv/std_step_conf": 0.9361345767974854,
"calib/answer_extract_rate": 0.8984375,
"calib/auroc": 0.7290708736324719,
"calib/avg_num_step_conf": 3.21875,
"calib/ece": 0.1712888888888889,
"calib/final_conf_rate": 0.87890625,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.2311111111111111,
"calib/gap": 0.29680910099889013,
"calib/mean_conf": 0.4968888888888889,
"calib/mu_c": 0.6538679245283018,
"calib/mu_w": 0.35705882352941165,
"calib/nonempty_final_conf_rate": 0.87890625,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.09853333333333336,
"calib/std_conf": 0.368311596476089,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_c": 0.5329842931937173,
"calib/step_q_c_n": 382.0,
"calib/step_q_gap": 0.09056347871407927,
"calib/step_q_w": 0.44242081447963805,
"calib/step_q_w_n": 442.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2394.0,
"completions/max_terminated_length": 2394.0,
"completions/mean_length": 554.72265625,
"completions/mean_terminated_length": 556.8980712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.03147805482149124,
"kl": 0.124420166015625,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.2282,
"mask/has_final_conf_rate": 0.87890625,
"mask/share_final_conf": 0.032010771334171295,
"mask/share_reasoning": 0.8855876922607422,
"mask/share_step_conf": 0.07849524170160294,
"num_tokens": 30241848.0,
"reward": 0.8135828971862793,
"reward_std": 0.3208565413951874,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.655642569065094,
"rewards/format_reward_step": 0.8515625,
"rewards/step_l2_reward": 0.7176169157028198,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.7757356762886047,
"adv/mean_abs_reasoning": 0.6284763813018799,
"adv/mean_abs_step_conf": 0.7494632005691528,
"adv/ratio_final_to_reasoning": 1.2343115817362609,
"adv/ratio_step_to_reasoning": 1.1925081401096578,
"adv/std_final_conf": 0.9355270862579346,
"adv/std_reasoning": 0.8591558337211609,
"adv/std_step_conf": 0.9356926083564758,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7336753731343283,
"calib/avg_num_step_conf": 3.76953125,
"calib/ece": 0.13034453781512606,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.25210084033613445,
"calib/gap": 0.3000838117106774,
"calib/mean_conf": 0.5786470588235293,
"calib/mu_c": 0.7097761194029851,
"calib/mu_w": 0.40969230769230774,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.07298319327731094,
"calib/std_conf": 0.3480800180446124,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.5702333931777379,
"calib/step_q_c_n": 557.0,
"calib/step_q_gap": 0.06636084415813015,
"calib/step_q_w": 0.5038725490196078,
"calib/step_q_w_n": 408.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1992.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 473.734375,
"completions/mean_terminated_length": 473.734375,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.1376,
"grad_norm": 0.03528955578804016,
"kl": 0.1468353271484375,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.1563,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.03455173224210739,
"mask/share_reasoning": 0.8706372976303101,
"mask/share_step_conf": 0.09481099247932434,
"num_tokens": 30465508.0,
"reward": 0.8781849145889282,
"reward_std": 0.27595558762550354,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7109373807907104,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l2_reward": 0.7579324841499329,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.7660256028175354,
"adv/mean_abs_reasoning": 0.5722837448120117,
"adv/mean_abs_step_conf": 0.7249982357025146,
"adv/ratio_final_to_reasoning": 1.3385416059111823,
"adv/ratio_step_to_reasoning": 1.2668510022780182,
"adv/std_final_conf": 0.9354217648506165,
"adv/std_reasoning": 0.8429265022277832,
"adv/std_step_conf": 0.9357032179832458,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6923210321864595,
"calib/avg_num_step_conf": 3.42578125,
"calib/ece": 0.20442148760330575,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.2603305785123967,
"calib/gap": 0.24038013318534956,
"calib/mean_conf": 0.5510330578512398,
"calib/mu_c": 0.6563235294117646,
"calib/mu_w": 0.4159433962264151,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.09673553719008264,
"calib/std_conf": 0.36934295992704985,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.5635294117647058,
"calib/step_q_c_n": 442.0,
"calib/step_q_gap": 0.10437998647734953,
"calib/step_q_w": 0.4591494252873563,
"calib/step_q_w_n": 435.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1408.0,
"completions/max_terminated_length": 1408.0,
"completions/mean_length": 449.43359375,
"completions/mean_terminated_length": 451.19610595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.03311045095324516,
"kl": 0.144561767578125,
"learning_rate": 1.944444444444445e-06,
"loss": -0.1809,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.034811727702617645,
"mask/share_reasoning": 0.875035285949707,
"mask/share_step_conf": 0.08624675869941711,
"num_tokens": 30685851.0,
"reward": 0.8714255094528198,
"reward_std": 0.2791735529899597,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6772284507751465,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": 0.7749974727630615,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.7358200550079346,
"adv/mean_abs_reasoning": 0.5314429998397827,
"adv/mean_abs_step_conf": 0.737650990486145,
"adv/ratio_final_to_reasoning": 1.3845700389877496,
"adv/ratio_step_to_reasoning": 1.3880152541449018,
"adv/std_final_conf": 0.9355961084365845,
"adv/std_reasoning": 0.7928544282913208,
"adv/std_step_conf": 0.9357744455337524,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.8268200897380228,
"calib/avg_num_step_conf": 3.73046875,
"calib/ece": 0.11046025104602511,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.24686192468619247,
"calib/gap": 0.4177652337530756,
"calib/mean_conf": 0.49271966527196653,
"calib/mu_c": 0.7391836734693877,
"calib/mu_w": 0.3214184397163121,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.09656903765690375,
"calib/std_conf": 0.37007028744111553,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_c": 0.5777211796246648,
"calib/step_q_c_n": 373.0,
"calib/step_q_gap": 0.15392392876555838,
"calib/step_q_w": 0.42379725085910647,
"calib/step_q_w_n": 582.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2146.0,
"completions/max_terminated_length": 2146.0,
"completions/mean_length": 502.39453125,
"completions/mean_terminated_length": 502.39453125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.028213733807206154,
"kl": 0.1276397705078125,
"learning_rate": 1.916666666666667e-06,
"loss": -0.168,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.031309232115745544,
"mask/share_reasoning": 0.8836838006973267,
"mask/share_step_conf": 0.08500701189041138,
"num_tokens": 30920672.0,
"reward": 0.8651392459869385,
"reward_std": 0.3005208969116211,
"rewards/accuracy_reward_step": 0.38671875,
"rewards/final_brier_reward_step": 0.7245796322822571,
"rewards/format_reward_step": 0.890625,
"rewards/step_l2_reward": 0.7502299547195435,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.716113805770874,
"adv/mean_abs_reasoning": 0.6242179870605469,
"adv/mean_abs_step_conf": 0.7061994671821594,
"adv/ratio_final_to_reasoning": 1.1472175115348184,
"adv/ratio_step_to_reasoning": 1.131334696886363,
"adv/std_final_conf": 0.9199298024177551,
"adv/std_reasoning": 0.8749078512191772,
"adv/std_step_conf": 0.9358008503913879,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7914211560044894,
"calib/avg_num_step_conf": 3.94140625,
"calib/ece": 0.14213991769547324,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.3991769547325103,
"calib/gap": 0.3619949494949495,
"calib/mean_conf": 0.6205761316872428,
"calib/mu_c": 0.7680555555555555,
"calib/mu_w": 0.406060606060606,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.08506172839506172,
"calib/std_conf": 0.3684698609691834,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.6124952015355086,
"calib/step_q_c_n": 521.0,
"calib/step_q_gap": 0.16530257858468894,
"calib/step_q_w": 0.4471926229508197,
"calib/step_q_w_n": 488.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2177.0,
"completions/max_terminated_length": 2177.0,
"completions/mean_length": 498.53125,
"completions/mean_terminated_length": 498.53125,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1408,
"grad_norm": 0.024374086409807205,
"kl": 0.1346588134765625,
"learning_rate": 1.888888888888889e-06,
"loss": -0.1078,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.034363698214292526,
"mask/share_reasoning": 0.8720892071723938,
"mask/share_step_conf": 0.09354706108570099,
"num_tokens": 31153888.0,
"reward": 0.9181256294250488,
"reward_std": 0.278538316488266,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7419136762619019,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7943376302719116,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7902538180351257,
"adv/mean_abs_reasoning": 0.6631218791007996,
"adv/mean_abs_step_conf": 0.743695855140686,
"adv/ratio_final_to_reasoning": 1.191717304074959,
"adv/ratio_step_to_reasoning": 1.1215070390214625,
"adv/std_final_conf": 0.9299471974372864,
"adv/std_reasoning": 0.8905849456787109,
"adv/std_step_conf": 0.9361699223518372,
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.7090786932056772,
"calib/avg_num_step_conf": 4.1484375,
"calib/ece": 0.16894736842105262,
"calib/final_conf_rate": 0.890625,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.19298245614035087,
"calib/gap": 0.26773998488284206,
"calib/mean_conf": 0.438859649122807,
"calib/mu_c": 0.6114814814814815,
"calib/mu_w": 0.34374149659863945,
"calib/nonempty_final_conf_rate": 0.890625,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.1262719298245614,
"calib/std_conf": 0.35228143347585994,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_c": 0.4963235294117647,
"calib/step_q_c_n": 340.0,
"calib/step_q_gap": 0.10362269838683402,
"calib/step_q_w": 0.3927008310249307,
"calib/step_q_w_n": 722.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2416.0,
"completions/max_terminated_length": 2416.0,
"completions/mean_length": 610.578125,
"completions/mean_terminated_length": 610.578125,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.03473455831408501,
"kl": 0.115020751953125,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.3156,
"mask/has_final_conf_rate": 0.890625,
"mask/share_final_conf": 0.02674572914838791,
"mask/share_reasoning": 0.8993955850601196,
"mask/share_step_conf": 0.07385867089033127,
"num_tokens": 31416540.0,
"reward": 0.8048692345619202,
"reward_std": 0.3397502303123474,
"rewards/accuracy_reward_step": 0.3203125,
"rewards/final_brier_reward_step": 0.6518843770027161,
"rewards/format_reward_step": 0.8515625,
"rewards/step_l2_reward": 0.723479151725769,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.8093454241752625,
"adv/mean_abs_reasoning": 0.658073902130127,
"adv/mean_abs_step_conf": 0.7692813873291016,
"adv/ratio_final_to_reasoning": 1.2298701126962839,
"adv/ratio_step_to_reasoning": 1.1689893564218332,
"adv/std_final_conf": 0.936396598815918,
"adv/std_reasoning": 0.8594855666160583,
"adv/std_step_conf": 0.9363592863082886,
"calib/answer_extract_rate": 0.875,
"calib/auroc": 0.7485759758753561,
"calib/avg_num_step_conf": 3.4296875,
"calib/ece": 0.16405731523378586,
"calib/final_conf_rate": 0.86328125,
"calib/format_rate": 0.83203125,
"calib/frac_conf_gt_0.9": 0.2171945701357466,
"calib/gap": 0.3283059697325068,
"calib/mean_conf": 0.4882503770739065,
"calib/mu_c": 0.6769148936170213,
"calib/mu_w": 0.3486089238845145,
"calib/nonempty_final_conf_rate": 0.86328125,
"calib/nonempty_reasoning_rate": 0.91796875,
"calib/nonempty_step_conf_rate": 0.87890625,
"calib/pce": 0.11348416289592761,
"calib/std_conf": 0.36948397379862874,
"calib/step_conf_rate": 0.87890625,
"calib/step_q_c": 0.5640285714285714,
"calib/step_q_c_n": 350.0,
"calib/step_q_gap": 0.14052478354978354,
"calib/step_q_w": 0.4235037878787879,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2195.0,
"completions/max_terminated_length": 2195.0,
"completions/mean_length": 588.9921875,
"completions/mean_terminated_length": 591.302001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.02548646554350853,
"kl": 0.1108551025390625,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.2445,
"mask/has_final_conf_rate": 0.86328125,
"mask/share_final_conf": 0.02605942077934742,
"mask/share_reasoning": 0.9032557010650635,
"mask/share_step_conf": 0.06677865236997604,
"num_tokens": 31676274.0,
"reward": 0.7943482398986816,
"reward_std": 0.3597278892993927,
"rewards/accuracy_reward_step": 0.37109375,
"rewards/final_brier_reward_step": 0.6471452713012695,
"rewards/format_reward_step": 0.83203125,
"rewards/step_l2_reward": 0.7009261846542358,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.8014721870422363,
"adv/mean_abs_reasoning": 0.6012312173843384,
"adv/mean_abs_step_conf": 0.7397862076759338,
"adv/ratio_final_to_reasoning": 1.33305151806496,
"adv/ratio_step_to_reasoning": 1.2304520894546696,
"adv/std_final_conf": 0.9359004497528076,
"adv/std_reasoning": 0.8432109951972961,
"adv/std_step_conf": 0.936022937297821,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6532982456140352,
"calib/avg_num_step_conf": 3.859375,
"calib/ece": 0.21463012552301258,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.27615062761506276,
"calib/gap": 0.208172098245614,
"calib/mean_conf": 0.5188426778242677,
"calib/mu_c": 0.627719298245614,
"calib/mu_w": 0.4195472,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.12824267782426782,
"calib/std_conf": 0.3687955259163612,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.543871921182266,
"calib/step_q_c_n": 406.0,
"calib/step_q_gap": 0.10062449850185362,
"calib/step_q_w": 0.4432474226804124,
"calib/step_q_w_n": 582.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2138.0,
"completions/max_terminated_length": 2138.0,
"completions/mean_length": 559.4375,
"completions/mean_terminated_length": 559.4375,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.144,
"grad_norm": 0.04485005512833595,
"kl": 0.1193695068359375,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.0281,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.0306907556951046,
"mask/share_reasoning": 0.8863241672515869,
"mask/share_step_conf": 0.082985058426857,
"num_tokens": 31925370.0,
"reward": 0.835256814956665,
"reward_std": 0.29443052411079407,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.6450504660606384,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l2_reward": 0.7567132711410522,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.7456756830215454,
"adv/mean_abs_reasoning": 0.591724157333374,
"adv/mean_abs_step_conf": 0.7272220849990845,
"adv/ratio_final_to_reasoning": 1.2601744812683657,
"adv/ratio_step_to_reasoning": 1.2289883317867851,
"adv/std_final_conf": 0.9112811088562012,
"adv/std_reasoning": 0.8431320190429688,
"adv/std_step_conf": 0.9358429312705994,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.8196135066364074,
"calib/avg_num_step_conf": 4.52734375,
"calib/ece": 0.135206611570248,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.2727272727272727,
"calib/gap": 0.4178254590468332,
"calib/mean_conf": 0.4905785123966942,
"calib/mu_c": 0.7167567567567569,
"calib/mu_w": 0.29893129770992366,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.08355371900826451,
"calib/std_conf": 0.3780961387224092,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5441613588110402,
"calib/step_q_c_n": 471.0,
"calib/step_q_gap": 0.16553913658881803,
"calib/step_q_w": 0.3786222222222222,
"calib/step_q_w_n": 675.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2723.0,
"completions/max_terminated_length": 2723.0,
"completions/mean_length": 527.8125,
"completions/mean_terminated_length": 527.8125,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.024483520537614822,
"kl": 0.127044677734375,
"learning_rate": 1.777777777777778e-06,
"loss": -0.1758,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.033305633813142776,
"mask/share_reasoning": 0.8706825971603394,
"mask/share_step_conf": 0.09601178765296936,
"num_tokens": 32168978.0,
"reward": 0.8965328931808472,
"reward_std": 0.2803770899772644,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.7379539012908936,
"rewards/format_reward_step": 0.90625,
"rewards/step_l2_reward": 0.7863619327545166,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.7307909727096558,
"adv/mean_abs_reasoning": 0.5491311550140381,
"adv/mean_abs_step_conf": 0.7555731534957886,
"adv/ratio_final_to_reasoning": 1.3308131691981193,
"adv/ratio_step_to_reasoning": 1.375942972087375,
"adv/std_final_conf": 0.907728374004364,
"adv/std_reasoning": 0.8099531531333923,
"adv/std_step_conf": 0.9357438683509827,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7726775956284153,
"calib/avg_num_step_conf": 4.1015625,
"calib/ece": 0.16673512396694207,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.3512396694214876,
"calib/gap": 0.34956230874316946,
"calib/mean_conf": 0.5520252066115703,
"calib/mu_c": 0.7282508333333334,
"calib/mu_w": 0.37868852459016394,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.11144628099173544,
"calib/std_conf": 0.3776974578334926,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.5540661478599223,
"calib/step_q_c_n": 514.0,
"calib/step_q_gap": 0.14429002845693722,
"calib/step_q_w": 0.40977611940298503,
"calib/step_q_w_n": 536.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1913.0,
"completions/max_terminated_length": 1913.0,
"completions/mean_length": 514.43359375,
"completions/mean_terminated_length": 516.4509887695312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.04350460320711136,
"kl": 0.1253662109375,
"learning_rate": 1.75e-06,
"loss": -0.1918,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.030667105689644814,
"mask/share_reasoning": 0.8750026822090149,
"mask/share_step_conf": 0.09042397141456604,
"num_tokens": 32407657.0,
"reward": 0.8747584223747253,
"reward_std": 0.2750248908996582,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7067804336547852,
"rewards/format_reward_step": 0.90625,
"rewards/step_l2_reward": 0.766173779964447,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.7335097789764404,
"adv/mean_abs_reasoning": 0.662164568901062,
"adv/mean_abs_step_conf": 0.7231161594390869,
"adv/ratio_final_to_reasoning": 1.1077454358420051,
"adv/ratio_step_to_reasoning": 1.0920490062450503,
"adv/std_final_conf": 0.9345899224281311,
"adv/std_reasoning": 0.8592326045036316,
"adv/std_step_conf": 0.9355090856552124,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.7949413808076423,
"calib/avg_num_step_conf": 3.98046875,
"calib/ece": 0.14609958506224066,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.2946058091286307,
"calib/gap": 0.38248588797221017,
"calib/mean_conf": 0.554896265560166,
"calib/mu_c": 0.7040816326530612,
"calib/mu_w": 0.32159574468085106,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.045518672199170135,
"calib/std_conf": 0.36411902667632423,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5553217391304349,
"calib/step_q_c_n": 575.0,
"calib/step_q_gap": 0.1506370544457502,
"calib/step_q_w": 0.4046846846846847,
"calib/step_q_w_n": 444.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1401.0,
"completions/max_terminated_length": 1401.0,
"completions/mean_length": 499.91015625,
"completions/mean_terminated_length": 501.87060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.1472,
"grad_norm": 0.0327833816409111,
"kl": 0.13421630859375,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.1421,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.03305238112807274,
"mask/share_reasoning": 0.8726654052734375,
"mask/share_step_conf": 0.09037593007087708,
"num_tokens": 32639970.0,
"reward": 0.915382444858551,
"reward_std": 0.26028817892074585,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7430488467216492,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": 0.7877160906791687,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.7603707313537598,
"adv/mean_abs_reasoning": 0.6023674011230469,
"adv/mean_abs_step_conf": 0.7213539481163025,
"adv/ratio_final_to_reasoning": 1.2623039193955936,
"adv/ratio_step_to_reasoning": 1.197531517760454,
"adv/std_final_conf": 0.924269437789917,
"adv/std_reasoning": 0.843031644821167,
"adv/std_step_conf": 0.9355605244636536,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7862875328628753,
"calib/avg_num_step_conf": 3.94921875,
"calib/ece": 0.12224489795918367,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.2653061224489796,
"calib/gap": 0.35155597066555966,
"calib/mean_conf": 0.5563673469387755,
"calib/mu_c": 0.6984246575342465,
"calib/mu_w": 0.3468686868686869,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.04134693877551021,
"calib/std_conf": 0.35494907331624814,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5624642857142857,
"calib/step_q_c_n": 560.0,
"calib/step_q_gap": 0.12816273360785557,
"calib/step_q_w": 0.4343015521064302,
"calib/step_q_w_n": 451.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1795.0,
"completions/max_terminated_length": 1795.0,
"completions/mean_length": 460.72265625,
"completions/mean_terminated_length": 460.72265625,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.04797978699207306,
"kl": 0.1426239013671875,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.1653,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.0341273695230484,
"mask/share_reasoning": 0.8714209794998169,
"mask/share_step_conf": 0.09445163607597351,
"num_tokens": 32861011.0,
"reward": 0.9247560501098633,
"reward_std": 0.2628259062767029,
"rewards/accuracy_reward_step": 0.578125,
"rewards/final_brier_reward_step": 0.7417035102844238,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.8062459826469421,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.783392071723938,
"adv/mean_abs_reasoning": 0.5787963271141052,
"adv/mean_abs_step_conf": 0.761437177658081,
"adv/ratio_final_to_reasoning": 1.3534848702823545,
"adv/ratio_step_to_reasoning": 1.3155528844742816,
"adv/std_final_conf": 0.9139004349708557,
"adv/std_reasoning": 0.7930855751037598,
"adv/std_step_conf": 0.9354227781295776,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7346195358877496,
"calib/avg_num_step_conf": 4.3515625,
"calib/ece": 0.1804897959183673,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.42448979591836733,
"calib/gap": 0.3148880194279548,
"calib/mean_conf": 0.6738775510204082,
"calib/mu_c": 0.8139705882352942,
"calib/mu_w": 0.4990825688073394,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.14963265306122445,
"calib/std_conf": 0.3511556217982749,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5867399334442596,
"calib/step_q_c_n": 601.0,
"calib/step_q_gap": 0.14962492369767083,
"calib/step_q_w": 0.43711500974658873,
"calib/step_q_w_n": 513.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1658.0,
"completions/max_terminated_length": 1658.0,
"completions/mean_length": 497.359375,
"completions/mean_terminated_length": 497.359375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.02688215859234333,
"kl": 0.154388427734375,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.111,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03305169939994812,
"mask/share_reasoning": 0.8668557405471802,
"mask/share_step_conf": 0.1000925675034523,
"num_tokens": 33093351.0,
"reward": 0.9080557227134705,
"reward_std": 0.26558494567871094,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7214667797088623,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.800113320350647,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.7365533113479614,
"adv/mean_abs_reasoning": 0.5491877794265747,
"adv/mean_abs_step_conf": 0.7725829482078552,
"adv/ratio_final_to_reasoning": 1.341168428978921,
"adv/ratio_step_to_reasoning": 1.406773743244132,
"adv/std_final_conf": 0.9102948307991028,
"adv/std_reasoning": 0.7931224703788757,
"adv/std_step_conf": 0.9355855584144592,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.8940781681800237,
"calib/avg_num_step_conf": 3.8125,
"calib/ece": 0.07226495726495723,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.42735042735042733,
"calib/gap": 0.547531780497434,
"calib/mean_conf": 0.6397008547008547,
"calib/mu_c": 0.8385906040268457,
"calib/mu_w": 0.2910588235294117,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.03760683760683757,
"calib/std_conf": 0.3746809415329093,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.6011498257839721,
"calib/step_q_c_n": 574.0,
"calib/step_q_gap": 0.20943340787352444,
"calib/step_q_w": 0.3917164179104477,
"calib/step_q_w_n": 402.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1963.0,
"completions/max_terminated_length": 1963.0,
"completions/mean_length": 550.11328125,
"completions/mean_terminated_length": 552.2706298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.1504,
"grad_norm": 0.03140458092093468,
"kl": 0.117950439453125,
"learning_rate": 1.638888888888889e-06,
"loss": -0.2438,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.02958410233259201,
"mask/share_reasoning": 0.8877764940261841,
"mask/share_step_conf": 0.0787331610918045,
"num_tokens": 33341276.0,
"reward": 0.9270013570785522,
"reward_std": 0.29023104906082153,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7821683883666992,
"rewards/format_reward_step": 0.88671875,
"rewards/step_l2_reward": 0.7773030996322632,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.7450395226478577,
"adv/mean_abs_reasoning": 0.6056081056594849,
"adv/mean_abs_step_conf": 0.7474773526191711,
"adv/ratio_final_to_reasoning": 1.2302337364466698,
"adv/ratio_step_to_reasoning": 1.2342591613848959,
"adv/std_final_conf": 0.9208177328109741,
"adv/std_reasoning": 0.8590512275695801,
"adv/std_step_conf": 0.9352788329124451,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7868398935425129,
"calib/avg_num_step_conf": 4.1484375,
"calib/ece": 0.14184100418410048,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.3305439330543933,
"calib/gap": 0.38385558201428766,
"calib/mean_conf": 0.5588284518828452,
"calib/mu_c": 0.7483471074380165,
"calib/mu_w": 0.3644915254237288,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.09719665271966534,
"calib/std_conf": 0.38254855554489575,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.5839204545454546,
"calib/step_q_c_n": 528.0,
"calib/step_q_gap": 0.15360210248552947,
"calib/step_q_w": 0.43031835205992514,
"calib/step_q_w_n": 534.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2318.0,
"completions/max_terminated_length": 2318.0,
"completions/mean_length": 580.203125,
"completions/mean_terminated_length": 582.4784545898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.02812611497938633,
"kl": 0.115814208984375,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.1799,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.02923966571688652,
"mask/share_reasoning": 0.8787957429885864,
"mask/share_step_conf": 0.08805837482213974,
"num_tokens": 33594968.0,
"reward": 0.8962991237640381,
"reward_std": 0.2742632031440735,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7293863296508789,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": 0.7850868701934814,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.7679699063301086,
"adv/mean_abs_reasoning": 0.570601224899292,
"adv/mean_abs_step_conf": 0.757947564125061,
"adv/ratio_final_to_reasoning": 1.3458960002507026,
"adv/ratio_step_to_reasoning": 1.328331470474559,
"adv/std_final_conf": 0.9319179058074951,
"adv/std_reasoning": 0.8098957538604736,
"adv/std_step_conf": 0.9355419874191284,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7768035210783304,
"calib/avg_num_step_conf": 4.66015625,
"calib/ece": 0.15472451790633607,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.3512396694214876,
"calib/gap": 0.34518189945670874,
"calib/mean_conf": 0.5920798898071625,
"calib/mu_c": 0.7504071246819339,
"calib/mu_w": 0.4052252252252252,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.10274104683195591,
"calib/std_conf": 0.3625200530213408,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5489919237147595,
"calib/step_q_c_n": 603.0,
"calib/step_q_gap": 0.13873768642662387,
"calib/step_q_w": 0.4102542372881356,
"calib/step_q_w_n": 590.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3050.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 565.94140625,
"completions/mean_terminated_length": 568.1608276367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.0339367501437664,
"kl": 0.116668701171875,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.1496,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.029600488021969795,
"mask/share_reasoning": 0.8680211305618286,
"mask/share_step_conf": 0.09847214818000793,
"num_tokens": 33847185.0,
"reward": 0.8974969387054443,
"reward_std": 0.24187132716178894,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7286787033081055,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.7780337333679199,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.7534208297729492,
"adv/mean_abs_reasoning": 0.567805290222168,
"adv/mean_abs_step_conf": 0.754891037940979,
"adv/ratio_final_to_reasoning": 1.3268999827003276,
"adv/ratio_step_to_reasoning": 1.3294892649654761,
"adv/std_final_conf": 0.9313053488731384,
"adv/std_reasoning": 0.8100166320800781,
"adv/std_step_conf": 0.9351264238357544,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7182993197278912,
"calib/avg_num_step_conf": 4.96875,
"calib/ece": 0.18710204081632648,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.4204081632653061,
"calib/gap": 0.32078571428571434,
"calib/mean_conf": 0.6030204081632653,
"calib/mu_c": 0.7405,
"calib/mu_w": 0.4197142857142857,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.10934693877551015,
"calib/std_conf": 0.38181181868772646,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5480421686746988,
"calib/step_q_c_n": 664.0,
"calib/step_q_gap": 0.12251585288522504,
"calib/step_q_w": 0.42552631578947375,
"calib/step_q_w_n": 608.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2328.0,
"completions/max_terminated_length": 2328.0,
"completions/mean_length": 558.953125,
"completions/mean_terminated_length": 558.953125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.1536,
"grad_norm": 0.04941343888640404,
"kl": 0.14173126220703125,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0236,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.031576041132211685,
"mask/share_reasoning": 0.8657428026199341,
"mask/share_step_conf": 0.10268114507198334,
"num_tokens": 34094405.0,
"reward": 0.903618335723877,
"reward_std": 0.22886237502098083,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7181081771850586,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.7914721965789795,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.752514123916626,
"adv/mean_abs_reasoning": 0.6453818082809448,
"adv/mean_abs_step_conf": 0.7151585817337036,
"adv/ratio_final_to_reasoning": 1.1659983505284128,
"adv/ratio_step_to_reasoning": 1.1081170441395272,
"adv/std_final_conf": 0.917668879032135,
"adv/std_reasoning": 0.8748682737350464,
"adv/std_step_conf": 0.9354785680770874,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7588111888111888,
"calib/avg_num_step_conf": 4.94921875,
"calib/ece": 0.1354732510288065,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.29730209790209794,
"calib/mean_conf": 0.6355555555555555,
"calib/mu_c": 0.757902097902098,
"calib/mu_w": 0.4606,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.091275720164609,
"calib/std_conf": 0.33938457979719,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5594109195402299,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.12373316122149086,
"calib/step_q_w": 0.43567775831873906,
"calib/step_q_w_n": 571.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1692.0,
"completions/max_terminated_length": 1692.0,
"completions/mean_length": 513.16015625,
"completions/mean_terminated_length": 515.172607421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.02969875931739807,
"kl": 0.123565673828125,
"learning_rate": 1.527777777777778e-06,
"loss": -0.1137,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03126313537359238,
"mask/share_reasoning": 0.8572871685028076,
"mask/share_step_conf": 0.1075434759259224,
"num_tokens": 34328478.0,
"reward": 0.908592164516449,
"reward_std": 0.2451344132423401,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7336207032203674,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7843448519706726,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.7495306134223938,
"adv/mean_abs_reasoning": 0.5869273543357849,
"adv/mean_abs_step_conf": 0.7415024042129517,
"adv/ratio_final_to_reasoning": 1.2770415416582928,
"adv/ratio_step_to_reasoning": 1.2633631721801355,
"adv/std_final_conf": 0.9171761870384216,
"adv/std_reasoning": 0.8100182414054871,
"adv/std_step_conf": 0.9355935454368591,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7831020048468825,
"calib/avg_num_step_conf": 4.828125,
"calib/ece": 0.24512396694214883,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.36363636363636365,
"calib/gap": 0.36282587941543665,
"calib/mean_conf": 0.6061157024793389,
"calib/mu_c": 0.8355056179775281,
"calib/mu_w": 0.47267973856209144,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.24173553719008273,
"calib/std_conf": 0.365805737861786,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5575,
"calib/step_q_c_n": 464.0,
"calib/step_q_gap": 0.12269430051813468,
"calib/step_q_w": 0.4348056994818653,
"calib/step_q_w_n": 772.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2151.0,
"completions/max_terminated_length": 2151.0,
"completions/mean_length": 552.859375,
"completions/mean_terminated_length": 552.859375,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.04507608339190483,
"kl": 0.1235198974609375,
"learning_rate": 1.5e-06,
"loss": -0.0665,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.030464837327599525,
"mask/share_reasoning": 0.8707108497619629,
"mask/share_step_conf": 0.09882433712482452,
"num_tokens": 34577226.0,
"reward": 0.855940043926239,
"reward_std": 0.29292845726013184,
"rewards/accuracy_reward_step": 0.3515625,
"rewards/final_brier_reward_step": 0.6881687641143799,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7690237760543823,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.7158942222595215,
"adv/mean_abs_reasoning": 0.5292023420333862,
"adv/mean_abs_step_conf": 0.7680728435516357,
"adv/ratio_final_to_reasoning": 1.3527797694711587,
"adv/ratio_step_to_reasoning": 1.451378390731271,
"adv/std_final_conf": 0.8983597159385681,
"adv/std_reasoning": 0.7927306890487671,
"adv/std_step_conf": 0.93496173620224,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7852531430513082,
"calib/avg_num_step_conf": 5.1640625,
"calib/ece": 0.20327868852459016,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.430327868852459,
"calib/gap": 0.36922324159021414,
"calib/mean_conf": 0.6336065573770493,
"calib/mu_c": 0.8378899082568808,
"calib/mu_w": 0.4686666666666667,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.19508196721311477,
"calib/std_conf": 0.37252256081700147,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.6004770318021202,
"calib/step_q_c_n": 566.0,
"calib/step_q_gap": 0.17474951857460697,
"calib/step_q_w": 0.42572751322751323,
"calib/step_q_w_n": 756.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2424.0,
"completions/max_terminated_length": 2424.0,
"completions/mean_length": 578.70703125,
"completions/mean_terminated_length": 580.9765014648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.1568,
"grad_norm": 0.028052711859345436,
"kl": 0.11347198486328125,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.0631,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.02955162152647972,
"mask/share_reasoning": 0.8645089864730835,
"mask/share_step_conf": 0.10203312337398529,
"num_tokens": 34829055.0,
"reward": 0.8883394002914429,
"reward_std": 0.24087491631507874,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.7088976502418518,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.795124888420105,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.7483645081520081,
"adv/mean_abs_reasoning": 0.632530927658081,
"adv/mean_abs_step_conf": 0.765121340751648,
"adv/ratio_final_to_reasoning": 1.1831271411862752,
"adv/ratio_step_to_reasoning": 1.2096188617756245,
"adv/std_final_conf": 0.9116754531860352,
"adv/std_reasoning": 0.8267991542816162,
"adv/std_step_conf": 0.9356098771095276,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7764132014114716,
"calib/avg_num_step_conf": 4.5078125,
"calib/ece": 0.16439024390243906,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.4959349593495935,
"calib/gap": 0.34411125717844054,
"calib/mean_conf": 0.706260162601626,
"calib/mu_c": 0.8419463087248322,
"calib/mu_w": 0.4978350515463917,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.13247967479674802,
"calib/std_conf": 0.352635601258254,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5817985611510792,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.13275716681556726,
"calib/step_q_w": 0.44904139433551193,
"calib/step_q_w_n": 459.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2280.0,
"completions/max_terminated_length": 2280.0,
"completions/mean_length": 517.2265625,
"completions/mean_terminated_length": 519.2549438476562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.030412495136260986,
"kl": 0.1313934326171875,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.1081,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03274453431367874,
"mask/share_reasoning": 0.8602826595306396,
"mask/share_step_conf": 0.10306654870510101,
"num_tokens": 35066577.0,
"reward": 0.907225489616394,
"reward_std": 0.2797033190727234,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7349027395248413,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7772043943405151,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.7614589929580688,
"adv/mean_abs_reasoning": 0.6402909755706787,
"adv/mean_abs_step_conf": 0.7309304475784302,
"adv/ratio_final_to_reasoning": 1.1892389897880342,
"adv/ratio_step_to_reasoning": 1.1415598149372108,
"adv/std_final_conf": 0.9285000562667847,
"adv/std_reasoning": 0.859099268913269,
"adv/std_step_conf": 0.9354667067527771,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.8360772357723576,
"calib/avg_num_step_conf": 5.36328125,
"calib/ece": 0.16386831275720157,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.3991769547325103,
"calib/gap": 0.4133841463414635,
"calib/mean_conf": 0.6461728395061728,
"calib/mu_c": 0.8554166666666667,
"calib/mu_w": 0.44203252032520324,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1581069958847736,
"calib/std_conf": 0.35694947101657876,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5887889273356403,
"calib/step_q_c_n": 578.0,
"calib/step_q_gap": 0.1774052795368981,
"calib/step_q_w": 0.41138364779874215,
"calib/step_q_w_n": 795.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2441.0,
"completions/max_terminated_length": 2441.0,
"completions/mean_length": 613.9609375,
"completions/mean_terminated_length": 613.9609375,
"completions/min_length": 94.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.024701019749045372,
"kl": 0.1152801513671875,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0094,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.027972478419542313,
"mask/share_reasoning": 0.8716949224472046,
"mask/share_step_conf": 0.10033257305622101,
"num_tokens": 35328207.0,
"reward": 0.9243149161338806,
"reward_std": 0.2653235197067261,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7544609308242798,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8129189014434814,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.7447877526283264,
"adv/mean_abs_reasoning": 0.5853399038314819,
"adv/mean_abs_step_conf": 0.7393544316291809,
"adv/ratio_final_to_reasoning": 1.272402150875313,
"adv/ratio_step_to_reasoning": 1.2631198160070074,
"adv/std_final_conf": 0.9059438705444336,
"adv/std_reasoning": 0.8266521692276001,
"adv/std_step_conf": 0.9355303645133972,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7687435098650052,
"calib/avg_num_step_conf": 4.73046875,
"calib/ece": 0.24756198347107436,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5909090909090909,
"calib/gap": 0.27484735202492194,
"calib/mean_conf": 0.7878099173553718,
"calib/mu_c": 0.9093333333333332,
"calib/mu_w": 0.6344859813084113,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.23876033057851237,
"calib/std_conf": 0.3046861225754724,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.60217503900156,
"calib/step_q_c_n": 641.0,
"calib/step_q_gap": 0.11470135479103366,
"calib/step_q_w": 0.48747368421052634,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2015.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 486.609375,
"completions/mean_terminated_length": 486.609375,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.16,
"grad_norm": 0.0389578640460968,
"kl": 0.1333160400390625,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0673,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03438667953014374,
"mask/share_reasoning": 0.8491482734680176,
"mask/share_step_conf": 0.11646504700183868,
"num_tokens": 35557739.0,
"reward": 0.889436662197113,
"reward_std": 0.26576003432273865,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.69478440284729,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.791120171546936,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.7764695882797241,
"adv/mean_abs_reasoning": 0.6994028091430664,
"adv/mean_abs_step_conf": 0.7512112855911255,
"adv/ratio_final_to_reasoning": 1.110189404630906,
"adv/ratio_step_to_reasoning": 1.0740753050613805,
"adv/std_final_conf": 0.9359955787658691,
"adv/std_reasoning": 0.8903999924659729,
"adv/std_step_conf": 0.9358260035514832,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.7411612193588937,
"calib/avg_num_step_conf": 4.6015625,
"calib/ece": 0.25863247863247874,
"calib/final_conf_rate": 0.9140625,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.3888888888888889,
"calib/gap": 0.3447061596480202,
"calib/mean_conf": 0.6183760683760683,
"calib/mu_c": 0.8363953488372093,
"calib/mu_w": 0.49168918918918914,
"calib/nonempty_final_conf_rate": 0.9140625,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.2547435897435898,
"calib/std_conf": 0.3808581342201271,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5588489208633094,
"calib/step_q_c_n": 417.0,
"calib/step_q_gap": 0.13473591166488624,
"calib/step_q_w": 0.42411300919842315,
"calib/step_q_w_n": 761.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2264.0,
"completions/max_terminated_length": 2264.0,
"completions/mean_length": 573.1015625,
"completions/mean_terminated_length": 575.3490600585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.03305526822805405,
"kl": 0.1082305908203125,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.2121,
"mask/has_final_conf_rate": 0.9140625,
"mask/share_final_conf": 0.027798429131507874,
"mask/share_reasoning": 0.8735820055007935,
"mask/share_step_conf": 0.09471327811479568,
"num_tokens": 35811477.0,
"reward": 0.8235229849815369,
"reward_std": 0.28725647926330566,
"rewards/accuracy_reward_step": 0.33984375,
"rewards/final_brier_reward_step": 0.6552902460098267,
"rewards/format_reward_step": 0.90234375,
"rewards/step_l2_reward": 0.7433182001113892,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.8092677593231201,
"adv/mean_abs_reasoning": 0.6949714422225952,
"adv/mean_abs_step_conf": 0.7740131616592407,
"adv/ratio_final_to_reasoning": 1.164461890311626,
"adv/ratio_step_to_reasoning": 1.1137337660722595,
"adv/std_final_conf": 0.9358307123184204,
"adv/std_reasoning": 0.8591673374176025,
"adv/std_step_conf": 0.9356124401092529,
"calib/answer_extract_rate": 0.92578125,
"calib/auroc": 0.6775272727272726,
"calib/avg_num_step_conf": 4.77734375,
"calib/ece": 0.23902127659574468,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.39148936170212767,
"calib/gap": 0.25546181818181823,
"calib/mean_conf": 0.6322978723404256,
"calib/mu_c": 0.7681818181818182,
"calib/mu_w": 0.51272,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.20161702127659575,
"calib/std_conf": 0.3734107267427342,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5278633975481611,
"calib/step_q_c_n": 571.0,
"calib/step_q_gap": 0.10728057546227154,
"calib/step_q_w": 0.4205828220858896,
"calib/step_q_w_n": 652.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2192.0,
"completions/max_terminated_length": 2192.0,
"completions/mean_length": 567.6328125,
"completions/mean_terminated_length": 567.6328125,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.03677584230899811,
"kl": 0.1179046630859375,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.1297,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.02956000715494156,
"mask/share_reasoning": 0.86711585521698,
"mask/share_step_conf": 0.10332408547401428,
"num_tokens": 36062183.0,
"reward": 0.8294390439987183,
"reward_std": 0.3084717392921448,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.6317847967147827,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l2_reward": 0.7606871128082275,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.7374790906906128,
"adv/mean_abs_reasoning": 0.5551480650901794,
"adv/mean_abs_step_conf": 0.7407870292663574,
"adv/ratio_final_to_reasoning": 1.3284367487992867,
"adv/ratio_step_to_reasoning": 1.3343954088104808,
"adv/std_final_conf": 0.9241796731948853,
"adv/std_reasoning": 0.826519787311554,
"adv/std_step_conf": 0.9351882338523865,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.719258064516129,
"calib/avg_num_step_conf": 4.8203125,
"calib/ece": 0.20381526104417674,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.357429718875502,
"calib/gap": 0.28110064516129024,
"calib/mean_conf": 0.5935341365461848,
"calib/mu_c": 0.73352,
"calib/mu_w": 0.4524193548387097,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.14767068273092374,
"calib/std_conf": 0.37711699610913185,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5223010380622837,
"calib/step_q_c_n": 578.0,
"calib/step_q_gap": 0.14027359903789338,
"calib/step_q_w": 0.3820274390243903,
"calib/step_q_w_n": 656.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1892.0,
"completions/max_terminated_length": 1892.0,
"completions/mean_length": 541.921875,
"completions/mean_terminated_length": 541.921875,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.1632,
"grad_norm": 0.03512969985604286,
"kl": 0.1247406005859375,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0449,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0296354778110981,
"mask/share_reasoning": 0.8736000657081604,
"mask/share_step_conf": 0.09676443040370941,
"num_tokens": 36308235.0,
"reward": 0.8974446654319763,
"reward_std": 0.22768069803714752,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.7062948942184448,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8010944128036499,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.7104947566986084,
"adv/mean_abs_reasoning": 0.5184451937675476,
"adv/mean_abs_step_conf": 0.7680416107177734,
"adv/ratio_final_to_reasoning": 1.3704336837138642,
"adv/ratio_step_to_reasoning": 1.4814325987601613,
"adv/std_final_conf": 0.9080306887626648,
"adv/std_reasoning": 0.7754558324813843,
"adv/std_step_conf": 0.9346005916595459,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.7593478260869566,
"calib/avg_num_step_conf": 4.74609375,
"calib/ece": 0.2056862745098039,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4117647058823529,
"calib/gap": 0.36628260869565216,
"calib/mean_conf": 0.609686274509804,
"calib/mu_c": 0.8107826086956521,
"calib/mu_w": 0.44449999999999995,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.18219607843137253,
"calib/std_conf": 0.38449240744160135,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5532495164410057,
"calib/step_q_c_n": 517.0,
"calib/step_q_gap": 0.13836412962152145,
"calib/step_q_w": 0.4148853868194843,
"calib/step_q_w_n": 698.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1379.0,
"completions/max_terminated_length": 1379.0,
"completions/mean_length": 518.19140625,
"completions/mean_terminated_length": 520.2235717773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.03980162367224693,
"kl": 0.1353759765625,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0146,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.031303271651268005,
"mask/share_reasoning": 0.8627222180366516,
"mask/share_step_conf": 0.10206824541091919,
"num_tokens": 36545332.0,
"reward": 0.9230555295944214,
"reward_std": 0.19609469175338745,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7388566136360168,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8236606121063232,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.7740155458450317,
"adv/mean_abs_reasoning": 0.6197652220726013,
"adv/mean_abs_step_conf": 0.7698578238487244,
"adv/ratio_final_to_reasoning": 1.248885091126267,
"adv/ratio_step_to_reasoning": 1.2421765475548752,
"adv/std_final_conf": 0.936392068862915,
"adv/std_reasoning": 0.826673686504364,
"adv/std_step_conf": 0.9354244470596313,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6865111022470416,
"calib/avg_num_step_conf": 4.83984375,
"calib/ece": 0.20919028340080975,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.22672064777327935,
"calib/gap": 0.25856069671586235,
"calib/mean_conf": 0.48489878542510123,
"calib/mu_c": 0.6293577981651377,
"calib/mu_w": 0.37079710144927536,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.12639676113360326,
"calib/std_conf": 0.3776904697899919,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.4596691176470588,
"calib/step_q_c_n": 544.0,
"calib/step_q_gap": 0.061090700380871776,
"calib/step_q_w": 0.39857841726618704,
"calib/step_q_w_n": 695.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1725.0,
"completions/max_terminated_length": 1725.0,
"completions/mean_length": 509.3359375,
"completions/mean_terminated_length": 509.3359375,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.046036504209041595,
"kl": 0.1375579833984375,
"learning_rate": 1.25e-06,
"loss": -0.0607,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03247381001710892,
"mask/share_reasoning": 0.8612602949142456,
"mask/share_step_conf": 0.10626588761806488,
"num_tokens": 36782938.0,
"reward": 0.879797637462616,
"reward_std": 0.24189157783985138,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.6951472759246826,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7894479632377625,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.7444911003112793,
"adv/mean_abs_reasoning": 0.5075423717498779,
"adv/mean_abs_step_conf": 0.7232406139373779,
"adv/ratio_final_to_reasoning": 1.4668550681679284,
"adv/ratio_step_to_reasoning": 1.4249856843357274,
"adv/std_final_conf": 0.9285725355148315,
"adv/std_reasoning": 0.7755016088485718,
"adv/std_step_conf": 0.9355357885360718,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7278023302531136,
"calib/avg_num_step_conf": 5.05078125,
"calib/ece": 0.24865306122448982,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.42857142857142855,
"calib/gap": 0.29628230882549883,
"calib/mean_conf": 0.6317551020408164,
"calib/mu_c": 0.7901754385964912,
"calib/mu_w": 0.49389312977099237,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.2075510204081633,
"calib/std_conf": 0.3830433731828229,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.504851294498382,
"calib/step_q_c_n": 618.0,
"calib/step_q_gap": 0.09015499820208561,
"calib/step_q_w": 0.41469629629629634,
"calib/step_q_w_n": 675.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2218.0,
"completions/max_terminated_length": 2218.0,
"completions/mean_length": 526.1875,
"completions/mean_terminated_length": 528.2510375976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.1664,
"grad_norm": 0.04315432161092758,
"kl": 0.126495361328125,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0911,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.031666506081819534,
"mask/share_reasoning": 0.8552207946777344,
"mask/share_step_conf": 0.1092064157128334,
"num_tokens": 37022402.0,
"reward": 0.8750288486480713,
"reward_std": 0.24085386097431183,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.6780582070350647,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.7946557998657227,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.708011269569397,
"adv/mean_abs_reasoning": 0.5233821868896484,
"adv/mean_abs_step_conf": 0.7319657206535339,
"adv/ratio_final_to_reasoning": 1.3527614949545013,
"adv/ratio_step_to_reasoning": 1.3985300588914844,
"adv/std_final_conf": 0.9187631011009216,
"adv/std_reasoning": 0.7928596138954163,
"adv/std_step_conf": 0.9353150725364685,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8377544529262086,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.13828685258964146,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4302788844621514,
"calib/gap": 0.4907907124681934,
"calib/mean_conf": 0.6067330677290836,
"calib/mu_c": 0.8413740458015267,
"calib/mu_w": 0.35058333333333336,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11155378486055781,
"calib/std_conf": 0.39990298183762135,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5631897711978466,
"calib/step_q_c_n": 743.0,
"calib/step_q_gap": 0.1772710720108548,
"calib/step_q_w": 0.38591869918699184,
"calib/step_q_w_n": 615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1956.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 547.1328125,
"completions/mean_terminated_length": 547.1328125,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.036939799785614014,
"kl": 0.121429443359375,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0261,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03212355449795723,
"mask/share_reasoning": 0.8532459735870361,
"mask/share_step_conf": 0.11463050544261932,
"num_tokens": 37266196.0,
"reward": 0.970373272895813,
"reward_std": 0.21288573741912842,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.8059629201889038,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8379085063934326,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.7646756768226624,
"adv/mean_abs_reasoning": 0.6326032876968384,
"adv/mean_abs_step_conf": 0.7455942034721375,
"adv/ratio_final_to_reasoning": 1.208776008115084,
"adv/ratio_step_to_reasoning": 1.178612596508426,
"adv/std_final_conf": 0.9190730452537537,
"adv/std_reasoning": 0.8590472340583801,
"adv/std_step_conf": 0.935570478439331,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6551655891278533,
"calib/avg_num_step_conf": 4.93359375,
"calib/ece": 0.24987951807228914,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4859437751004016,
"calib/gap": 0.18371684918854747,
"calib/mean_conf": 0.7106024096385543,
"calib/mu_c": 0.788811188811189,
"calib/mu_w": 0.6050943396226415,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.19309236947791164,
"calib/std_conf": 0.34552218579669786,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5519859154929578,
"calib/step_q_c_n": 710.0,
"calib/step_q_gap": 0.0916965845707155,
"calib/step_q_w": 0.46028933092224233,
"calib/step_q_w_n": 553.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2249.0,
"completions/max_terminated_length": 2249.0,
"completions/mean_length": 517.7265625,
"completions/mean_terminated_length": 517.7265625,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.0262977983802557,
"kl": 0.1364593505859375,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0338,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03399697691202164,
"mask/share_reasoning": 0.8510940670967102,
"mask/share_step_conf": 0.11490896344184875,
"num_tokens": 37503974.0,
"reward": 0.8746780157089233,
"reward_std": 0.24372267723083496,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.674838662147522,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.771392285823822,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.7417625188827515,
"adv/mean_abs_reasoning": 0.5368902683258057,
"adv/mean_abs_step_conf": 0.7408811450004578,
"adv/ratio_final_to_reasoning": 1.3815905458592173,
"adv/ratio_step_to_reasoning": 1.3799489182598903,
"adv/std_final_conf": 0.9166836738586426,
"adv/std_reasoning": 0.7928199768066406,
"adv/std_step_conf": 0.9350181221961975,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7813659247482776,
"calib/avg_num_step_conf": 4.91796875,
"calib/ece": 0.16489878542510122,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.3805668016194332,
"calib/gap": 0.37062930577636466,
"calib/mean_conf": 0.6106477732793523,
"calib/mu_c": 0.7772058823529412,
"calib/mu_w": 0.40657657657657653,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1124696356275304,
"calib/std_conf": 0.3754937173998822,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5579135618479881,
"calib/step_q_c_n": 671.0,
"calib/step_q_gap": 0.14823669109968884,
"calib/step_q_w": 0.4096768707482993,
"calib/step_q_w_n": 588.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1715.0,
"completions/max_terminated_length": 1715.0,
"completions/mean_length": 493.08203125,
"completions/mean_terminated_length": 495.0157165527344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1696,
"grad_norm": 0.04497016221284866,
"kl": 0.1365509033203125,
"learning_rate": 1.138888888888889e-06,
"loss": -0.0894,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.032456062734127045,
"mask/share_reasoning": 0.8510378003120422,
"mask/share_step_conf": 0.11259990930557251,
"num_tokens": 37734987.0,
"reward": 0.9372793436050415,
"reward_std": 0.2131517231464386,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7540343999862671,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8220866918563843,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.7424492835998535,
"adv/mean_abs_reasoning": 0.5817896127700806,
"adv/mean_abs_step_conf": 0.7358402013778687,
"adv/ratio_final_to_reasoning": 1.2761473689171288,
"adv/ratio_step_to_reasoning": 1.2647874510414607,
"adv/std_final_conf": 0.9117441177368164,
"adv/std_reasoning": 0.8267434239387512,
"adv/std_step_conf": 0.9357884526252747,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7352083747432585,
"calib/avg_num_step_conf": 4.9609375,
"calib/ece": 0.19443089430894323,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.3699186991869919,
"calib/gap": 0.3141780958060029,
"calib/mean_conf": 0.5548373983739838,
"calib/mu_c": 0.7042635658914729,
"calib/mu_w": 0.39008547008546995,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.11243902439024402,
"calib/std_conf": 0.39687572288358697,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5302148760330578,
"calib/step_q_c_n": 605.0,
"calib/step_q_gap": 0.1507111166345616,
"calib/step_q_w": 0.37950375939849623,
"calib/step_q_w_n": 665.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2315.0,
"completions/max_terminated_length": 2315.0,
"completions/mean_length": 554.3203125,
"completions/mean_terminated_length": 554.3203125,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.02868187241256237,
"kl": 0.1332244873046875,
"learning_rate": 1.111111111111111e-06,
"loss": -0.0085,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.029985826462507248,
"mask/share_reasoning": 0.8714199662208557,
"mask/share_step_conf": 0.09859418123960495,
"num_tokens": 37981733.0,
"reward": 0.8914778828620911,
"reward_std": 0.25153106451034546,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6966515779495239,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.7980228066444397,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.7147070169448853,
"adv/mean_abs_reasoning": 0.47162288427352905,
"adv/mean_abs_step_conf": 0.7598281502723694,
"adv/ratio_final_to_reasoning": 1.515420563287115,
"adv/ratio_step_to_reasoning": 1.6110926242325612,
"adv/std_final_conf": 0.8813939690589905,
"adv/std_reasoning": 0.7208422422409058,
"adv/std_step_conf": 0.9348978996276855,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.786046511627907,
"calib/avg_num_step_conf": 4.71484375,
"calib/ece": 0.18036585365853652,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.3699186991869919,
"calib/gap": 0.39735755813953494,
"calib/mean_conf": 0.5832113821138211,
"calib/mu_c": 0.722125,
"calib/mu_w": 0.3247674418604651,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.056585365853658504,
"calib/std_conf": 0.3924985785698982,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5363151041666666,
"calib/step_q_c_n": 768.0,
"calib/step_q_gap": 0.12002808822133632,
"calib/step_q_w": 0.4162870159453303,
"calib/step_q_w_n": 439.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2155.0,
"completions/max_terminated_length": 2155.0,
"completions/mean_length": 510.08203125,
"completions/mean_terminated_length": 510.08203125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.04517321288585663,
"kl": 0.13018798828125,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0389,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03298802673816681,
"mask/share_reasoning": 0.8571385741233826,
"mask/share_step_conf": 0.10987342894077301,
"num_tokens": 38216234.0,
"reward": 0.9338958859443665,
"reward_std": 0.20754006505012512,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.756743311882019,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.7946420907974243,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.7190934419631958,
"adv/mean_abs_reasoning": 0.580523669719696,
"adv/mean_abs_step_conf": 0.7531288862228394,
"adv/ratio_final_to_reasoning": 1.2386978851532577,
"adv/ratio_step_to_reasoning": 1.2973267508394364,
"adv/std_final_conf": 0.8893725872039795,
"adv/std_reasoning": 0.809906005859375,
"adv/std_step_conf": 0.9347851276397705,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7156410071423418,
"calib/avg_num_step_conf": 4.7109375,
"calib/ece": 0.21260000000000007,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.452,
"calib/gap": 0.275508982035928,
"calib/mean_conf": 0.6440400000000001,
"calib/mu_c": 0.7355089820359281,
"calib/mu_w": 0.4600000000000001,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.09432000000000004,
"calib/std_conf": 0.3745665206608835,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5134777376654633,
"calib/step_q_c_n": 831.0,
"calib/step_q_gap": 0.062251070998796676,
"calib/step_q_w": 0.45122666666666666,
"calib/step_q_w_n": 375.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1795.0,
"completions/max_terminated_length": 1795.0,
"completions/mean_length": 495.19921875,
"completions/mean_terminated_length": 495.19921875,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.1728,
"grad_norm": 0.027035508304834366,
"kl": 0.135833740234375,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.0237,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03328103572130203,
"mask/share_reasoning": 0.8577634692192078,
"mask/share_step_conf": 0.1089554876089096,
"num_tokens": 38447149.0,
"reward": 0.9405144453048706,
"reward_std": 0.21334876120090485,
"rewards/accuracy_reward_step": 0.65234375,
"rewards/final_brier_reward_step": 0.7292284965515137,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8291440010070801,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.7169132232666016,
"adv/mean_abs_reasoning": 0.6146094799041748,
"adv/mean_abs_step_conf": 0.7588496208190918,
"adv/ratio_final_to_reasoning": 1.1664532466670985,
"adv/ratio_step_to_reasoning": 1.2346858381315657,
"adv/std_final_conf": 0.9145965576171875,
"adv/std_reasoning": 0.8429942727088928,
"adv/std_step_conf": 0.9352772831916809,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7947131608548932,
"calib/avg_num_step_conf": 5.6796875,
"calib/ece": 0.17284552845528456,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.34959349593495936,
"calib/gap": 0.3886931780586249,
"calib/mean_conf": 0.5401626016260163,
"calib/mu_c": 0.7281889763779527,
"calib/mu_w": 0.33949579831932775,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.0983739837398374,
"calib/std_conf": 0.3966280306520489,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5100657894736842,
"calib/step_q_c_n": 608.0,
"calib/step_q_gap": 0.18380101405997262,
"calib/step_q_w": 0.32626477541371157,
"calib/step_q_w_n": 846.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2221.0,
"completions/max_terminated_length": 2221.0,
"completions/mean_length": 586.7109375,
"completions/mean_terminated_length": 586.7109375,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.028267567977309227,
"kl": 0.12030029296875,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.0168,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.031356584280729294,
"mask/share_reasoning": 0.8597544431686401,
"mask/share_step_conf": 0.10888896882534027,
"num_tokens": 38702179.0,
"reward": 0.9220717549324036,
"reward_std": 0.22672028839588165,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7462793588638306,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8088016510009766,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.7333135008811951,
"adv/mean_abs_reasoning": 0.6133211255073547,
"adv/mean_abs_step_conf": 0.763792872428894,
"adv/ratio_final_to_reasoning": 1.1956436365608303,
"adv/ratio_step_to_reasoning": 1.245339253228992,
"adv/std_final_conf": 0.899974524974823,
"adv/std_reasoning": 0.8268165588378906,
"adv/std_step_conf": 0.9351148009300232,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.8030457197123865,
"calib/avg_num_step_conf": 5.27734375,
"calib/ece": 0.1518518518518519,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.3004115226337449,
"calib/gap": 0.38663614163614163,
"calib/mean_conf": 0.5279835390946501,
"calib/mu_c": 0.7284615384615385,
"calib/mu_w": 0.34182539682539687,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.09917695473251034,
"calib/std_conf": 0.3845199665392035,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5197297297297298,
"calib/step_q_c_n": 592.0,
"calib/step_q_gap": 0.1706783463305202,
"calib/step_q_w": 0.34905138339920955,
"calib/step_q_w_n": 759.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2259.0,
"completions/max_terminated_length": 2259.0,
"completions/mean_length": 588.58984375,
"completions/mean_terminated_length": 588.58984375,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.023231087252497673,
"kl": 0.1348876953125,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0142,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.027646934613585472,
"mask/share_reasoning": 0.8728476166725159,
"mask/share_step_conf": 0.09950542449951172,
"num_tokens": 38958994.0,
"reward": 0.9149197340011597,
"reward_std": 0.24809305369853973,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7365156412124634,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.8151988387107849,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.747089684009552,
"adv/mean_abs_reasoning": 0.5145565271377563,
"adv/mean_abs_step_conf": 0.730810284614563,
"adv/ratio_final_to_reasoning": 1.4519098380993662,
"adv/ratio_step_to_reasoning": 1.4202721101989082,
"adv/std_final_conf": 0.9169217348098755,
"adv/std_reasoning": 0.7754154801368713,
"adv/std_step_conf": 0.9349702000617981,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7628610261637784,
"calib/avg_num_step_conf": 4.93359375,
"calib/ece": 0.18192622950819667,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.35655737704918034,
"calib/gap": 0.39493510023785255,
"calib/mean_conf": 0.5325,
"calib/mu_c": 0.7510091743119266,
"calib/mu_w": 0.35607407407407404,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.13385245901639342,
"calib/std_conf": 0.4102656826105114,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5064244741873806,
"calib/step_q_c_n": 523.0,
"calib/step_q_gap": 0.11672177148467788,
"calib/step_q_w": 0.3897027027027027,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1980.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 564.90625,
"completions/mean_terminated_length": 564.90625,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.176,
"grad_norm": 0.028785517439246178,
"kl": 0.135772705078125,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0701,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.02926689013838768,
"mask/share_reasoning": 0.8692151308059692,
"mask/share_step_conf": 0.10151800513267517,
"num_tokens": 39209186.0,
"reward": 0.9045617580413818,
"reward_std": 0.2308696061372757,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.7248390913009644,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8108468651771545,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.7528688907623291,
"adv/mean_abs_reasoning": 0.48331791162490845,
"adv/mean_abs_step_conf": 0.7518739700317383,
"adv/ratio_final_to_reasoning": 1.5577094758007082,
"adv/ratio_step_to_reasoning": 1.555650953435489,
"adv/std_final_conf": 0.9187546372413635,
"adv/std_reasoning": 0.7395718693733215,
"adv/std_step_conf": 0.9352030754089355,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7663355408388521,
"calib/avg_num_step_conf": 5.38671875,
"calib/ece": 0.1821991701244813,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.4190871369294606,
"calib/gap": 0.4205172921265635,
"calib/mean_conf": 0.5820331950207468,
"calib/mu_c": 0.7390728476821191,
"calib/mu_w": 0.3185555555555556,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.06883817427385891,
"calib/std_conf": 0.4090059994814056,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.4997358943577431,
"calib/step_q_c_n": 833.0,
"calib/step_q_gap": 0.13204358666543548,
"calib/step_q_w": 0.36769230769230765,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2181.0,
"completions/max_terminated_length": 2181.0,
"completions/mean_length": 581.94921875,
"completions/mean_terminated_length": 581.94921875,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.04616737365722656,
"kl": 0.11749267578125,
"learning_rate": 9.444444444444445e-07,
"loss": -0.0654,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.02924707904458046,
"mask/share_reasoning": 0.8627008199691772,
"mask/share_step_conf": 0.1080520898103714,
"num_tokens": 39464349.0,
"reward": 0.924762487411499,
"reward_std": 0.23554068803787231,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7360988855361938,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.80951988697052,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.7144575119018555,
"adv/mean_abs_reasoning": 0.5261666178703308,
"adv/mean_abs_step_conf": 0.7523694634437561,
"adv/ratio_final_to_reasoning": 1.3578541238393944,
"adv/ratio_step_to_reasoning": 1.4299072535026747,
"adv/std_final_conf": 0.882361888885498,
"adv/std_reasoning": 0.7754969596862793,
"adv/std_step_conf": 0.9349962472915649,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7076388888888889,
"calib/avg_num_step_conf": 5.01171875,
"calib/ece": 0.19746987951807232,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.46586345381526106,
"calib/gap": 0.286376984126984,
"calib/mean_conf": 0.677710843373494,
"calib/mu_c": 0.7984722222222221,
"calib/mu_w": 0.5120952380952382,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1484337349397591,
"calib/std_conf": 0.36235727106055615,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.512751677852349,
"calib/step_q_c_n": 745.0,
"calib/step_q_gap": 0.07468476335420776,
"calib/step_q_w": 0.4380669144981412,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1876.0,
"completions/max_terminated_length": 1876.0,
"completions/mean_length": 519.20703125,
"completions/mean_terminated_length": 519.20703125,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.046507444232702255,
"kl": 0.1273956298828125,
"learning_rate": 9.166666666666666e-07,
"loss": -0.072,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03145875409245491,
"mask/share_reasoning": 0.864819347858429,
"mask/share_step_conf": 0.10372191667556763,
"num_tokens": 39702874.0,
"reward": 0.9286771416664124,
"reward_std": 0.20551800727844238,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.7300738096237183,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8218116760253906,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.7601954936981201,
"adv/mean_abs_reasoning": 0.7227966785430908,
"adv/mean_abs_step_conf": 0.733101487159729,
"adv/ratio_final_to_reasoning": 1.0517418193321149,
"adv/ratio_step_to_reasoning": 1.0142568566272456,
"adv/std_final_conf": 0.9152436256408691,
"adv/std_reasoning": 0.9055777788162231,
"adv/std_step_conf": 0.935141921043396,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7616805635422657,
"calib/avg_num_step_conf": 5.75390625,
"calib/ece": 0.167603305785124,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.34710743801652894,
"calib/gap": 0.35553910293271984,
"calib/mean_conf": 0.5681818181818182,
"calib/mu_c": 0.7062837837837838,
"calib/mu_w": 0.3507446808510639,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.062107438016528946,
"calib/std_conf": 0.386343314973604,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5048571428571428,
"calib/step_q_c_n": 805.0,
"calib/step_q_gap": 0.1718331907613344,
"calib/step_q_w": 0.3330239520958084,
"calib/step_q_w_n": 668.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2926.0,
"completions/max_terminated_length": 2926.0,
"completions/mean_length": 628.80078125,
"completions/mean_terminated_length": 628.80078125,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.1792,
"grad_norm": 0.02880941890180111,
"kl": 0.1226959228515625,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0246,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.02778783068060875,
"mask/share_reasoning": 0.8679161667823792,
"mask/share_step_conf": 0.104296013712883,
"num_tokens": 39968519.0,
"reward": 0.9214756488800049,
"reward_std": 0.24216872453689575,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.7268156409263611,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.8130106925964355,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.7137185335159302,
"adv/mean_abs_reasoning": 0.45779332518577576,
"adv/mean_abs_step_conf": 0.7305762767791748,
"adv/ratio_final_to_reasoning": 1.5590409345227962,
"adv/ratio_step_to_reasoning": 1.5958648512026727,
"adv/std_final_conf": 0.8747919797897339,
"adv/std_reasoning": 0.7394520044326782,
"adv/std_step_conf": 0.9345008134841919,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7390500767192077,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.203402489626556,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.42738589211618255,
"calib/gap": 0.3206653647649603,
"calib/mean_conf": 0.6016597510373444,
"calib/mu_c": 0.7440298507462687,
"calib/mu_w": 0.4233644859813084,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12452282157676348,
"calib/std_conf": 0.3937295662857602,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5413274336283186,
"calib/step_q_c_n": 678.0,
"calib/step_q_gap": 0.16220978656949508,
"calib/step_q_w": 0.3791176470588235,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2258.0,
"completions/max_terminated_length": 2258.0,
"completions/mean_length": 574.625,
"completions/mean_terminated_length": 574.625,
"completions/min_length": 149.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.03632810339331627,
"kl": 0.124847412109375,
"learning_rate": 8.611111111111112e-07,
"loss": -0.035,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.02968929149210453,
"mask/share_reasoning": 0.8687570691108704,
"mask/share_step_conf": 0.10155363380908966,
"num_tokens": 40219807.0,
"reward": 0.9066611528396606,
"reward_std": 0.2341693490743637,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7024269700050354,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.8194890022277832,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.7154579162597656,
"adv/mean_abs_reasoning": 0.5741154551506042,
"adv/mean_abs_step_conf": 0.7530225515365601,
"adv/ratio_final_to_reasoning": 1.246191702106476,
"adv/ratio_step_to_reasoning": 1.3116221567994963,
"adv/std_final_conf": 0.8930612802505493,
"adv/std_reasoning": 0.8098737001419067,
"adv/std_step_conf": 0.9352511763572693,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7849570585077832,
"calib/avg_num_step_conf": 4.94140625,
"calib/ece": 0.15979674796747967,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4268292682926829,
"calib/gap": 0.41097826086956524,
"calib/mean_conf": 0.6163821138211382,
"calib/mu_c": 0.7968115942028986,
"calib/mu_w": 0.38583333333333336,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.10760162601626017,
"calib/std_conf": 0.39772973665533484,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5032117812061712,
"calib/step_q_c_n": 713.0,
"calib/step_q_gap": 0.12346540439457693,
"calib/step_q_w": 0.37974637681159423,
"calib/step_q_w_n": 552.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2128.0,
"completions/max_terminated_length": 2128.0,
"completions/mean_length": 582.06640625,
"completions/mean_terminated_length": 582.06640625,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.030894074589014053,
"kl": 0.1164398193359375,
"learning_rate": 8.333333333333333e-07,
"loss": -0.083,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.028358610346913338,
"mask/share_reasoning": 0.8738337755203247,
"mask/share_step_conf": 0.09780760109424591,
"num_tokens": 40472968.0,
"reward": 0.9447938799858093,
"reward_std": 0.2374449223279953,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7596187591552734,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8315315246582031,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.7335254549980164,
"adv/mean_abs_reasoning": 0.5795778036117554,
"adv/mean_abs_step_conf": 0.7598888874053955,
"adv/ratio_final_to_reasoning": 1.2656203367811971,
"adv/ratio_step_to_reasoning": 1.3111076419248553,
"adv/std_final_conf": 0.9092419743537903,
"adv/std_reasoning": 0.8100183010101318,
"adv/std_step_conf": 0.9349356889724731,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7129726205997393,
"calib/avg_num_step_conf": 4.97265625,
"calib/ece": 0.2564112903225807,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.3870967741935484,
"calib/gap": 0.28493741851368964,
"calib/mean_conf": 0.5499596774193548,
"calib/mu_c": 0.699322033898305,
"calib/mu_w": 0.4143846153846154,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.16528225806451619,
"calib/std_conf": 0.41306632481560407,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5213624124932688,
"calib/step_q_c_n": 619.0,
"calib/step_q_gap": 0.13478748894586817,
"calib/step_q_w": 0.3865749235474006,
"calib/step_q_w_n": 654.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1631.0,
"completions/max_terminated_length": 1631.0,
"completions/mean_length": 512.66796875,
"completions/mean_terminated_length": 512.66796875,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.1824,
"grad_norm": 0.02686500735580921,
"kl": 0.128143310546875,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0714,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.031494684517383575,
"mask/share_reasoning": 0.863057553768158,
"mask/share_step_conf": 0.10544778406620026,
"num_tokens": 40711107.0,
"reward": 0.8865392208099365,
"reward_std": 0.2288045734167099,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6851121187210083,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8028100728988647,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.6808489561080933,
"adv/mean_abs_reasoning": 0.5288216471672058,
"adv/mean_abs_step_conf": 0.7524287700653076,
"adv/ratio_final_to_reasoning": 1.2874831424834214,
"adv/ratio_step_to_reasoning": 1.422840335859777,
"adv/std_final_conf": 0.8914951682090759,
"adv/std_reasoning": 0.7754030823707581,
"adv/std_step_conf": 0.9347220063209534,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7453017832647463,
"calib/avg_num_step_conf": 4.90234375,
"calib/ece": 0.1977380952380953,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.48412698412698413,
"calib/gap": 0.28341975308641987,
"calib/mean_conf": 0.7096428571428572,
"calib/mu_c": 0.8108641975308643,
"calib/mu_w": 0.5274444444444444,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.13226190476190483,
"calib/std_conf": 0.3410526879252575,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5031578947368421,
"calib/step_q_c_n": 798.0,
"calib/step_q_gap": 0.06532419670620754,
"calib/step_q_w": 0.4378336980306346,
"calib/step_q_w_n": 457.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 500.8203125,
"completions/mean_terminated_length": 500.8203125,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.039231326431035995,
"kl": 0.140899658203125,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0453,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033453211188316345,
"mask/share_reasoning": 0.855216383934021,
"mask/share_step_conf": 0.11133037507534027,
"num_tokens": 40942669.0,
"reward": 0.9580105543136597,
"reward_std": 0.19059154391288757,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.7589675784111023,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8359596729278564,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.7563506960868835,
"adv/mean_abs_reasoning": 0.640105128288269,
"adv/mean_abs_step_conf": 0.7533974051475525,
"adv/ratio_final_to_reasoning": 1.1816038688980222,
"adv/ratio_step_to_reasoning": 1.17699010967502,
"adv/std_final_conf": 0.9212831854820251,
"adv/std_reasoning": 0.8430480360984802,
"adv/std_step_conf": 0.9354016780853271,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7427426416110999,
"calib/avg_num_step_conf": 5.15234375,
"calib/ece": 0.21116935483870966,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.6129032258064516,
"calib/gap": 0.2908378797063381,
"calib/mean_conf": 0.7775403225806451,
"calib/mu_c": 0.8959863945578231,
"calib/mu_w": 0.605148514851485,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.1979838709677419,
"calib/std_conf": 0.3265669397168442,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5286137281292059,
"calib/step_q_c_n": 743.0,
"calib/step_q_gap": 0.0913915059069837,
"calib/step_q_w": 0.43722222222222223,
"calib/step_q_w_n": 576.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1986.0,
"completions/max_terminated_length": 1986.0,
"completions/mean_length": 547.046875,
"completions/mean_terminated_length": 547.046875,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.0344572588801384,
"kl": 0.128143310546875,
"learning_rate": 7.5e-07,
"loss": -0.0827,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03216134011745453,
"mask/share_reasoning": 0.8585292100906372,
"mask/share_step_conf": 0.10930944979190826,
"num_tokens": 41185873.0,
"reward": 0.9170801639556885,
"reward_std": 0.2532092332839966,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7265589237213135,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8013514280319214,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.7960557341575623,
"adv/mean_abs_reasoning": 0.6628378629684448,
"adv/mean_abs_step_conf": 0.7811141610145569,
"adv/ratio_final_to_reasoning": 1.2009810824513194,
"adv/ratio_step_to_reasoning": 1.1784392604194711,
"adv/std_final_conf": 0.9070166349411011,
"adv/std_reasoning": 0.8432024717330933,
"adv/std_step_conf": 0.9354541897773743,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.5976460331299042,
"calib/avg_num_step_conf": 5.31640625,
"calib/ece": 0.3175319148936169,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.3574468085106383,
"calib/gap": 0.12810883464109252,
"calib/mean_conf": 0.5589787234042554,
"calib/mu_c": 0.6265765765765764,
"calib/mu_w": 0.4984677419354839,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.2020851063829786,
"calib/std_conf": 0.39205923832479334,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.46542757417102976,
"calib/step_q_c_n": 573.0,
"calib/step_q_gap": 0.06306716807965917,
"calib/step_q_w": 0.4023604060913706,
"calib/step_q_w_n": 788.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2184.0,
"completions/max_terminated_length": 2184.0,
"completions/mean_length": 612.0703125,
"completions/mean_terminated_length": 614.4706420898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1856,
"grad_norm": 0.03898928686976433,
"kl": 0.1152191162109375,
"learning_rate": 7.222222222222222e-07,
"loss": -0.0874,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.026619601994752884,
"mask/share_reasoning": 0.8688848614692688,
"mask/share_step_conf": 0.10058927536010742,
"num_tokens": 41446795.0,
"reward": 0.7974745035171509,
"reward_std": 0.2710039019584656,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.5851151943206787,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l2_reward": 0.7434275150299072,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.7434352040290833,
"adv/mean_abs_reasoning": 0.5331301689147949,
"adv/mean_abs_step_conf": 0.7573065757751465,
"adv/ratio_final_to_reasoning": 1.3944722084333956,
"adv/ratio_step_to_reasoning": 1.4204909418588532,
"adv/std_final_conf": 0.9098041653633118,
"adv/std_reasoning": 0.7928568720817566,
"adv/std_step_conf": 0.9353412985801697,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7717921146953404,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.19048979591836732,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.3346938775510204,
"calib/gap": 0.41721863799283143,
"calib/mean_conf": 0.4824897959183674,
"calib/mu_c": 0.7464444444444444,
"calib/mu_w": 0.3292258064516129,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.15281632653061222,
"calib/std_conf": 0.41405107258912305,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5602347417840375,
"calib/step_q_c_n": 426.0,
"calib/step_q_gap": 0.19993202779656366,
"calib/step_q_w": 0.3603027139874739,
"calib/step_q_w_n": 958.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2505.0,
"completions/max_terminated_length": 2505.0,
"completions/mean_length": 574.67578125,
"completions/mean_terminated_length": 576.929443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.033894505351781845,
"kl": 0.120086669921875,
"learning_rate": 6.944444444444446e-07,
"loss": -0.0982,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.028536062687635422,
"mask/share_reasoning": 0.8603699803352356,
"mask/share_step_conf": 0.10718771815299988,
"num_tokens": 41699736.0,
"reward": 0.903607964515686,
"reward_std": 0.2339988648891449,
"rewards/accuracy_reward_step": 0.35546875,
"rewards/final_brier_reward_step": 0.7357914447784424,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.810486912727356,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.7065272927284241,
"adv/mean_abs_reasoning": 0.5671974420547485,
"adv/mean_abs_step_conf": 0.7515043020248413,
"adv/ratio_final_to_reasoning": 1.245646119575107,
"adv/ratio_step_to_reasoning": 1.3249430380052782,
"adv/std_final_conf": 0.8785935640335083,
"adv/std_reasoning": 0.8099661469459534,
"adv/std_step_conf": 0.9349579215049744,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7959815546772069,
"calib/avg_num_step_conf": 5.3359375,
"calib/ece": 0.1721862348178138,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.48582995951417,
"calib/gap": 0.39831620553359687,
"calib/mean_conf": 0.6418218623481781,
"calib/mu_c": 0.8272727272727273,
"calib/mu_w": 0.4289565217391304,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.13979757085020245,
"calib/std_conf": 0.39943421606484,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5598706896551724,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.16825874935666496,
"calib/step_q_w": 0.39161194029850743,
"calib/step_q_w_n": 670.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2210.0,
"completions/max_terminated_length": 2210.0,
"completions/mean_length": 547.34375,
"completions/mean_terminated_length": 547.34375,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.02775077521800995,
"kl": 0.1219940185546875,
"learning_rate": 6.666666666666667e-07,
"loss": -0.048,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.031332723796367645,
"mask/share_reasoning": 0.8548117876052856,
"mask/share_step_conf": 0.1138555184006691,
"num_tokens": 41943920.0,
"reward": 0.9143821001052856,
"reward_std": 0.24923500418663025,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7271945476531982,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8101633787155151,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.7506808042526245,
"adv/mean_abs_reasoning": 0.5949095487594604,
"adv/mean_abs_step_conf": 0.7533080577850342,
"adv/ratio_final_to_reasoning": 1.2618402340624508,
"adv/ratio_step_to_reasoning": 1.2662564575671635,
"adv/std_final_conf": 0.9032699465751648,
"adv/std_reasoning": 0.8100223541259766,
"adv/std_step_conf": 0.9353511333465576,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7749007936507937,
"calib/avg_num_step_conf": 5.15625,
"calib/ece": 0.17178861788617883,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.4186991869918699,
"calib/gap": 0.40023412698412675,
"calib/mean_conf": 0.5900813008130081,
"calib/mu_c": 0.7853174603174601,
"calib/mu_w": 0.38508333333333333,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.12483739837398371,
"calib/std_conf": 0.4068688607069138,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5241347626339969,
"calib/step_q_c_n": 653.0,
"calib/step_q_gap": 0.12902231885588605,
"calib/step_q_w": 0.3951124437781109,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2030.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 542.515625,
"completions/mean_terminated_length": 546.7874145507812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.1888,
"grad_norm": 0.04885758087038994,
"kl": 0.118408203125,
"learning_rate": 6.388888888888889e-07,
"loss": -0.1204,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03018646314740181,
"mask/share_reasoning": 0.8533045053482056,
"mask/share_step_conf": 0.10869648307561874,
"num_tokens": 42186636.0,
"reward": 0.9126665592193604,
"reward_std": 0.23725032806396484,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.733467161655426,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8043658137321472,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.712067186832428,
"adv/mean_abs_reasoning": 0.5390047430992126,
"adv/mean_abs_step_conf": 0.7495079636573792,
"adv/ratio_final_to_reasoning": 1.3210777751938267,
"adv/ratio_step_to_reasoning": 1.390540571772705,
"adv/std_final_conf": 0.8925026059150696,
"adv/std_reasoning": 0.7754620909690857,
"adv/std_step_conf": 0.9346250891685486,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.8412121212121212,
"calib/avg_num_step_conf": 5.3125,
"calib/ece": 0.12073469387755098,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.44081632653061226,
"calib/gap": 0.47537710437710434,
"calib/mean_conf": 0.6293061224489797,
"calib/mu_c": 0.8427407407407408,
"calib/mu_w": 0.36736363636363645,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.09951020408163262,
"calib/std_conf": 0.38333542641685114,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5588975155279502,
"calib/step_q_c_n": 644.0,
"calib/step_q_gap": 0.19666287865644178,
"calib/step_q_w": 0.3622346368715084,
"calib/step_q_w_n": 716.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2247.0,
"completions/max_terminated_length": 2247.0,
"completions/mean_length": 527.1328125,
"completions/mean_terminated_length": 527.1328125,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.03510009124875069,
"kl": 0.1291351318359375,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0409,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03149116411805153,
"mask/share_reasoning": 0.8561463356018066,
"mask/share_step_conf": 0.11236252635717392,
"num_tokens": 42427654.0,
"reward": 0.9619373083114624,
"reward_std": 0.21293267607688904,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7950069904327393,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8304301500320435,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.7379262447357178,
"adv/mean_abs_reasoning": 0.6535248756408691,
"adv/mean_abs_step_conf": 0.723167359828949,
"adv/ratio_final_to_reasoning": 1.1291479058269691,
"adv/ratio_step_to_reasoning": 1.1065643968330754,
"adv/std_final_conf": 0.9199758172035217,
"adv/std_reasoning": 0.8747087717056274,
"adv/std_step_conf": 0.9348064064979553,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7792224935647811,
"calib/avg_num_step_conf": 5.5234375,
"calib/ece": 0.13942204301075267,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4153225806451613,
"calib/gap": 0.4033064924207423,
"calib/mean_conf": 0.6219489247311828,
"calib/mu_c": 0.7992086330935252,
"calib/mu_w": 0.3959021406727829,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.10044354838709676,
"calib/std_conf": 0.3861761245642092,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.503463476070529,
"calib/step_q_c_n": 794.0,
"calib/step_q_gap": 0.11443121800601286,
"calib/step_q_w": 0.38903225806451613,
"calib/step_q_w_n": 620.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1900.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 550.66796875,
"completions/mean_terminated_length": 552.8274536132812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.03115103580057621,
"kl": 0.1255645751953125,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0055,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.030252009630203247,
"mask/share_reasoning": 0.8536373972892761,
"mask/share_step_conf": 0.11220435798168182,
"num_tokens": 42674889.0,
"reward": 0.9555507302284241,
"reward_std": 0.22918008267879486,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.770561695098877,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8389773368835449,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.7011657953262329,
"adv/mean_abs_reasoning": 0.5596412420272827,
"adv/mean_abs_step_conf": 0.7473223209381104,
"adv/ratio_final_to_reasoning": 1.2528844242898933,
"adv/ratio_step_to_reasoning": 1.3353596283057318,
"adv/std_final_conf": 0.8937993049621582,
"adv/std_reasoning": 0.8098291158676147,
"adv/std_step_conf": 0.9352113604545593,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7252700210748156,
"calib/avg_num_step_conf": 5.0703125,
"calib/ece": 0.20264,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.436,
"calib/gap": 0.2982336670179136,
"calib/mean_conf": 0.62888,
"calib/mu_c": 0.7529452054794521,
"calib/mu_w": 0.4547115384615385,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.12376000000000001,
"calib/std_conf": 0.38166208300013243,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5047162162162162,
"calib/step_q_c_n": 740.0,
"calib/step_q_gap": 0.07845367141334875,
"calib/step_q_w": 0.42626254480286746,
"calib/step_q_w_n": 558.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2238.0,
"completions/max_terminated_length": 2238.0,
"completions/mean_length": 600.7265625,
"completions/mean_terminated_length": 603.0823974609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.192,
"grad_norm": 0.030468562617897987,
"kl": 0.122283935546875,
"learning_rate": 5.555555555555555e-07,
"loss": -0.1283,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02861013635993004,
"mask/share_reasoning": 0.8696733117103577,
"mask/share_step_conf": 0.09781032800674438,
"num_tokens": 42932531.0,
"reward": 0.9047421216964722,
"reward_std": 0.2203877568244934,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7101773023605347,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7954006195068359,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.7361574172973633,
"adv/mean_abs_reasoning": 0.5702579021453857,
"adv/mean_abs_step_conf": 0.7411701083183289,
"adv/ratio_final_to_reasoning": 1.290920151264615,
"adv/ratio_step_to_reasoning": 1.299710368817247,
"adv/std_final_conf": 0.9074108600616455,
"adv/std_reasoning": 0.8100024461746216,
"adv/std_step_conf": 0.9351939558982849,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.8395074432324895,
"calib/avg_num_step_conf": 4.76171875,
"calib/ece": 0.14921487603305783,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.384297520661157,
"calib/gap": 0.4427721753447212,
"calib/mean_conf": 0.590702479338843,
"calib/mu_c": 0.8267256637168142,
"calib/mu_w": 0.38395348837209303,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.13648760330578508,
"calib/std_conf": 0.39298066624636846,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5355160142348754,
"calib/step_q_c_n": 562.0,
"calib/step_q_gap": 0.1532176885118921,
"calib/step_q_w": 0.3822983257229833,
"calib/step_q_w_n": 657.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2152.0,
"completions/max_terminated_length": 2152.0,
"completions/mean_length": 516.21875,
"completions/mean_terminated_length": 516.21875,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.03454497084021568,
"kl": 0.1400146484375,
"learning_rate": 5.277777777777779e-07,
"loss": -0.1265,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.030553974211215973,
"mask/share_reasoning": 0.8623343706130981,
"mask/share_step_conf": 0.10711166262626648,
"num_tokens": 43170947.0,
"reward": 0.9192020297050476,
"reward_std": 0.24850967526435852,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.7540082335472107,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8078334331512451,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.749147891998291,
"adv/mean_abs_reasoning": 0.5334514379501343,
"adv/mean_abs_step_conf": 0.7665261030197144,
"adv/ratio_final_to_reasoning": 1.4043413115109449,
"adv/ratio_step_to_reasoning": 1.4369182431398138,
"adv/std_final_conf": 0.8957116007804871,
"adv/std_reasoning": 0.7754759192466736,
"adv/std_step_conf": 0.9350084662437439,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7560219794262346,
"calib/avg_num_step_conf": 5.21484375,
"calib/ece": 0.17150793650793655,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4642857142857143,
"calib/gap": 0.34841863139735474,
"calib/mean_conf": 0.6640476190476191,
"calib/mu_c": 0.8175177304964538,
"calib/mu_w": 0.4690990990990991,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1380158730158731,
"calib/std_conf": 0.37316679655334634,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5327656675749318,
"calib/step_q_c_n": 734.0,
"calib/step_q_gap": 0.12168413679290185,
"calib/step_q_w": 0.41108153078202997,
"calib/step_q_w_n": 601.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2878.0,
"completions/max_terminated_length": 2878.0,
"completions/mean_length": 523.50390625,
"completions/mean_terminated_length": 523.50390625,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.035674456506967545,
"kl": 0.13336181640625,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0079,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03135332465171814,
"mask/share_reasoning": 0.8553948998451233,
"mask/share_step_conf": 0.11325179040431976,
"num_tokens": 43411124.0,
"reward": 0.9420223236083984,
"reward_std": 0.22298182547092438,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7525800466537476,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8275582194328308,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.7440370321273804,
"adv/mean_abs_reasoning": 0.666670560836792,
"adv/mean_abs_step_conf": 0.7296310663223267,
"adv/ratio_final_to_reasoning": 1.1160490290638896,
"adv/ratio_step_to_reasoning": 1.0944402065789554,
"adv/std_final_conf": 0.9097049236297607,
"adv/std_reasoning": 0.8749170899391174,
"adv/std_step_conf": 0.935576319694519,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7327666666666666,
"calib/avg_num_step_conf": 4.421875,
"calib/ece": 0.2053061224489796,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.39591836734693875,
"calib/gap": 0.31537,
"calib/mean_conf": 0.5926530612244898,
"calib/mu_c": 0.74712,
"calib/mu_w": 0.43175,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1438775510204082,
"calib/std_conf": 0.391795320940714,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.525043630017452,
"calib/step_q_c_n": 573.0,
"calib/step_q_gap": 0.10416706472228204,
"calib/step_q_w": 0.42087656529517,
"calib/step_q_w_n": 559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1791.0,
"completions/max_terminated_length": 1791.0,
"completions/mean_length": 562.40625,
"completions/mean_terminated_length": 564.61181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.1952,
"grad_norm": 0.041152384132146835,
"kl": 0.1201171875,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.1033,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.029298175126314163,
"mask/share_reasoning": 0.8750410079956055,
"mask/share_step_conf": 0.09175451099872589,
"num_tokens": 43661780.0,
"reward": 0.8991619944572449,
"reward_std": 0.2737298905849457,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7152284979820251,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7948142290115356,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.7543465495109558,
"adv/mean_abs_reasoning": 0.5843380689620972,
"adv/mean_abs_step_conf": 0.7728927731513977,
"adv/ratio_final_to_reasoning": 1.290941989884089,
"adv/ratio_step_to_reasoning": 1.3226808489890312,
"adv/std_final_conf": 0.9035541415214539,
"adv/std_reasoning": 0.7930352687835693,
"adv/std_step_conf": 0.9352104663848877,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6918325326012355,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.2148995983935742,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5742971887550201,
"calib/gap": 0.25845710363761143,
"calib/mean_conf": 0.7393975903614458,
"calib/mu_c": 0.8369677419354838,
"calib/mu_w": 0.5785106382978724,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16590361445783122,
"calib/std_conf": 0.3532716262621504,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5172530120481927,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.1111004696753114,
"calib/step_q_w": 0.40615254237288134,
"calib/step_q_w_n": 590.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2478.0,
"completions/max_terminated_length": 2478.0,
"completions/mean_length": 557.3359375,
"completions/mean_terminated_length": 557.3359375,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.042663298547267914,
"kl": 0.132781982421875,
"learning_rate": 4.444444444444445e-07,
"loss": -0.0198,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.029847152531147003,
"mask/share_reasoning": 0.8627386093139648,
"mask/share_step_conf": 0.10741420835256577,
"num_tokens": 43909738.0,
"reward": 0.9294754266738892,
"reward_std": 0.23524896800518036,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7246004343032837,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8202879428863525,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.7113676071166992,
"adv/mean_abs_reasoning": 0.5241381525993347,
"adv/mean_abs_step_conf": 0.7602637410163879,
"adv/ratio_final_to_reasoning": 1.3572139398531589,
"adv/ratio_step_to_reasoning": 1.4505025769371038,
"adv/std_final_conf": 0.9085453748703003,
"adv/std_reasoning": 0.7929010391235352,
"adv/std_step_conf": 0.9355183839797974,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.8112326667131209,
"calib/avg_num_step_conf": 5.34375,
"calib/ece": 0.19073611111111122,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.5333333333333333,
"calib/gap": 0.41783127772745216,
"calib/mean_conf": 0.681013888888889,
"calib/mu_c": 0.8777427821522309,
"calib/mu_w": 0.45991150442477874,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.1712916666666668,
"calib/std_conf": 0.3895297970246458,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5331184407796102,
"calib/step_q_c_n": 667.0,
"calib/step_q_gap": 0.1487104521918784,
"calib/step_q_w": 0.38440798858773184,
"calib/step_q_w_n": 701.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2726.0,
"completions/max_terminated_length": 2726.0,
"completions/mean_length": 575.1015625,
"completions/mean_terminated_length": 577.3568725585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.04241366311907768,
"kl": 0.11263275146484375,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.024,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.029185505583882332,
"mask/share_reasoning": 0.8618265986442566,
"mask/share_step_conf": 0.10508161783218384,
"num_tokens": 44163884.0,
"reward": 0.8901246786117554,
"reward_std": 0.2551291584968567,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7246838212013245,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7719718217849731,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.7126970291137695,
"adv/mean_abs_reasoning": 0.5148271918296814,
"adv/mean_abs_step_conf": 0.7509140968322754,
"adv/ratio_final_to_reasoning": 1.3843422422597071,
"adv/ratio_step_to_reasoning": 1.4585750495492433,
"adv/std_final_conf": 0.8954695463180542,
"adv/std_reasoning": 0.7754849195480347,
"adv/std_step_conf": 0.9343070387840271,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7468113975576662,
"calib/avg_num_step_conf": 5.828125,
"calib/ece": 0.2148360655737705,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.3423364993215739,
"calib/mean_conf": 0.6474590163934427,
"calib/mu_c": 0.8017910447761194,
"calib/mu_w": 0.4594545454545455,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.15655737704918035,
"calib/std_conf": 0.39706414701523246,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5039824120603015,
"calib/step_q_c_n": 796.0,
"calib/step_q_gap": 0.10428413619823246,
"calib/step_q_w": 0.399698275862069,
"calib/step_q_w_n": 696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2154.0,
"completions/max_terminated_length": 2154.0,
"completions/mean_length": 559.98046875,
"completions/mean_terminated_length": 562.176513671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.1984,
"grad_norm": 0.0349576398730278,
"kl": 0.1226654052734375,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0252,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.02924538031220436,
"mask/share_reasoning": 0.8526691198348999,
"mask/share_step_conf": 0.11417928338050842,
"num_tokens": 44412279.0,
"reward": 0.9230327010154724,
"reward_std": 0.2111315280199051,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7192398309707642,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8315130472183228,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.7762144804000854,
"adv/mean_abs_reasoning": 0.705559492111206,
"adv/mean_abs_step_conf": 0.7481155395507812,
"adv/ratio_final_to_reasoning": 1.1001403695632561,
"adv/ratio_step_to_reasoning": 1.0603153212668674,
"adv/std_final_conf": 0.9334425926208496,
"adv/std_reasoning": 0.8903596997261047,
"adv/std_step_conf": 0.9350556135177612,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6508196721311476,
"calib/avg_num_step_conf": 5.796875,
"calib/ece": 0.29305785123966943,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.4834710743801653,
"calib/gap": 0.18424316939890717,
"calib/mean_conf": 0.6740495867768596,
"calib/mu_c": 0.7654098360655739,
"calib/mu_w": 0.5811666666666667,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2314876033057851,
"calib/std_conf": 0.37280504402024484,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.52334375,
"calib/step_q_c_n": 640.0,
"calib/step_q_gap": 0.12317787322274881,
"calib/step_q_w": 0.4001658767772512,
"calib/step_q_w_n": 844.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2330.0,
"completions/max_terminated_length": 2330.0,
"completions/mean_length": 618.01953125,
"completions/mean_terminated_length": 618.01953125,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.034964174032211304,
"kl": 0.1192626953125,
"learning_rate": 3.611111111111111e-07,
"loss": 0.043,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.027607450261712074,
"mask/share_reasoning": 0.8714186549186707,
"mask/share_step_conf": 0.10097391903400421,
"num_tokens": 44672036.0,
"reward": 0.8439823389053345,
"reward_std": 0.2684392035007477,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.6231184005737305,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7828149199485779,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.732832670211792,
"adv/mean_abs_reasoning": 0.6510793566703796,
"adv/mean_abs_step_conf": 0.7332849502563477,
"adv/ratio_final_to_reasoning": 1.1255658203625114,
"adv/ratio_step_to_reasoning": 1.1262604822956874,
"adv/std_final_conf": 0.9215667247772217,
"adv/std_reasoning": 0.8902121186256409,
"adv/std_step_conf": 0.9355761408805847,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7742424242424243,
"calib/avg_num_step_conf": 5.515625,
"calib/ece": 0.19988980716253446,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.47520661157024796,
"calib/gap": 0.36629797979797984,
"calib/mean_conf": 0.6526170798898072,
"calib/mu_c": 0.8191161616161616,
"calib/mu_w": 0.4528181818181818,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.15352617079889808,
"calib/std_conf": 0.39370417374071326,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5479680696661829,
"calib/step_q_c_n": 689.0,
"calib/step_q_gap": 0.14402616095249,
"calib/step_q_w": 0.40394190871369295,
"calib/step_q_w_n": 723.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2294.0,
"completions/max_terminated_length": 2294.0,
"completions/mean_length": 588.5859375,
"completions/mean_terminated_length": 593.220458984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.029144667088985443,
"kl": 0.112518310546875,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0333,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.029021695256233215,
"mask/share_reasoning": 0.8537598848342896,
"mask/share_step_conf": 0.10940589010715485,
"num_tokens": 44926786.0,
"reward": 0.8943223357200623,
"reward_std": 0.26499199867248535,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7098451852798462,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7897369265556335,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.6746816635131836,
"adv/mean_abs_reasoning": 0.500690758228302,
"adv/mean_abs_step_conf": 0.7490466833114624,
"adv/ratio_final_to_reasoning": 1.347501731209399,
"adv/ratio_step_to_reasoning": 1.4960265812813676,
"adv/std_final_conf": 0.8674516081809998,
"adv/std_reasoning": 0.7394338846206665,
"adv/std_step_conf": 0.9350956082344055,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.749208547685002,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.2093117408906882,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.41700404858299595,
"calib/gap": 0.35126566416040106,
"calib/mean_conf": 0.5776518218623482,
"calib/mu_c": 0.7397744360902256,
"calib/mu_w": 0.3885087719298246,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.12425101214574895,
"calib/std_conf": 0.4089823356048055,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5151793400286945,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.14206376213924726,
"calib/step_q_w": 0.3731155778894472,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1883.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 531.33984375,
"completions/mean_terminated_length": 533.423583984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.2016,
"grad_norm": 0.03737876936793327,
"kl": 0.137725830078125,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0113,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.030570272356271744,
"mask/share_reasoning": 0.8609588146209717,
"mask/share_step_conf": 0.10456467419862747,
"num_tokens": 45170577.0,
"reward": 0.9082374572753906,
"reward_std": 0.20283764600753784,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7151933312416077,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8059688806533813,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.7458434700965881,
"adv/mean_abs_reasoning": 0.6624995470046997,
"adv/mean_abs_step_conf": 0.7595421075820923,
"adv/ratio_final_to_reasoning": 1.12580223408258,
"adv/ratio_step_to_reasoning": 1.146479436878323,
"adv/std_final_conf": 0.9165393710136414,
"adv/std_reasoning": 0.8747674822807312,
"adv/std_step_conf": 0.9350847005844116,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.79893524409027,
"calib/avg_num_step_conf": 5.69140625,
"calib/ece": 0.14926829268292682,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.36585365853658536,
"calib/gap": 0.39336101252260086,
"calib/mean_conf": 0.5851219512195123,
"calib/mu_c": 0.7594160583941605,
"calib/mu_w": 0.3660550458715596,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.08873983739837399,
"calib/std_conf": 0.3849577441781069,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.49921896792189685,
"calib/step_q_c_n": 717.0,
"calib/step_q_gap": 0.14427302197595093,
"calib/step_q_w": 0.3549459459459459,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2179.0,
"completions/max_terminated_length": 2179.0,
"completions/mean_length": 589.5859375,
"completions/mean_terminated_length": 591.8980712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.07689023017883301,
"kl": 0.1240234375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0043,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.02882501296699047,
"mask/share_reasoning": 0.8632330894470215,
"mask/share_step_conf": 0.1040356308221817,
"num_tokens": 45427119.0,
"reward": 0.9469718933105469,
"reward_std": 0.21352709829807281,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7646961212158203,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8308101296424866,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.7412534356117249,
"adv/mean_abs_reasoning": 0.5375125408172607,
"adv/mean_abs_step_conf": 0.7689924240112305,
"adv/ratio_final_to_reasoning": 1.3790439837639628,
"adv/ratio_step_to_reasoning": 1.4306501999786205,
"adv/std_final_conf": 0.8983718752861023,
"adv/std_reasoning": 0.7928674817085266,
"adv/std_step_conf": 0.9350568652153015,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7611643281165679,
"calib/avg_num_step_conf": 6.05078125,
"calib/ece": 0.2628979591836735,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5510204081632653,
"calib/gap": 0.3385462763086886,
"calib/mean_conf": 0.6828979591836735,
"calib/mu_c": 0.8708256880733946,
"calib/mu_w": 0.532279411764706,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.25044897959183676,
"calib/std_conf": 0.38765460351772957,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5619716088328075,
"calib/step_q_c_n": 634.0,
"calib/step_q_gap": 0.15599346675630477,
"calib/step_q_w": 0.40597814207650273,
"calib/step_q_w_n": 915.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2208.0,
"completions/max_terminated_length": 2208.0,
"completions/mean_length": 563.8671875,
"completions/mean_terminated_length": 566.0784912109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.056538574397563934,
"kl": 0.1218719482421875,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0491,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.030824054032564163,
"mask/share_reasoning": 0.8450208902359009,
"mask/share_step_conf": 0.12024883925914764,
"num_tokens": 45675637.0,
"reward": 0.8770207166671753,
"reward_std": 0.24598871171474457,
"rewards/accuracy_reward_step": 0.4296875,
"rewards/final_brier_reward_step": 0.6788250207901001,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7994351387023926,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.7328107357025146,
"adv/mean_abs_reasoning": 0.6302567720413208,
"adv/mean_abs_step_conf": 0.7598267793655396,
"adv/ratio_final_to_reasoning": 1.1627177496705585,
"adv/ratio_step_to_reasoning": 1.2055828879149655,
"adv/std_final_conf": 0.9117442965507507,
"adv/std_reasoning": 0.843053936958313,
"adv/std_step_conf": 0.9347034692764282,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7464337604015322,
"calib/avg_num_step_conf": 5.1796875,
"calib/ece": 0.1904453441295546,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.44534412955465585,
"calib/gap": 0.36322480517765154,
"calib/mean_conf": 0.6183805668016195,
"calib/mu_c": 0.7845522388059701,
"calib/mu_w": 0.4213274336283186,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1331578947368421,
"calib/std_conf": 0.4011287324877253,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5290642458100558,
"calib/step_q_c_n": 716.0,
"calib/step_q_gap": 0.10758883597399022,
"calib/step_q_w": 0.42147540983606563,
"calib/step_q_w_n": 610.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2437.0,
"completions/max_terminated_length": 2437.0,
"completions/mean_length": 567.3671875,
"completions/mean_terminated_length": 569.5921630859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.2048,
"grad_norm": 0.04736936837434769,
"kl": 0.1262664794921875,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.0253,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.030947385355830193,
"mask/share_reasoning": 0.8560739159584045,
"mask/share_step_conf": 0.10907246917486191,
"num_tokens": 45925859.0,
"reward": 0.929151177406311,
"reward_std": 0.2332858443260193,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7307624816894531,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.832227349281311,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.7735573053359985,
"adv/mean_abs_reasoning": 0.6546435356140137,
"adv/mean_abs_step_conf": 0.7471445798873901,
"adv/ratio_final_to_reasoning": 1.1816465958232543,
"adv/ratio_step_to_reasoning": 1.1412998666314125,
"adv/std_final_conf": 0.9217936992645264,
"adv/std_reasoning": 0.8267903923988342,
"adv/std_step_conf": 0.9356114268302917,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6999547832827338,
"calib/avg_num_step_conf": 5.25,
"calib/ece": 0.23779599999999992,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.332,
"calib/gap": 0.24896563529487753,
"calib/mean_conf": 0.570204,
"calib/mu_c": 0.7066371681415928,
"calib/mu_w": 0.4576715328467153,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17799999999999994,
"calib/std_conf": 0.3786099343440422,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5068074324324324,
"calib/step_q_c_n": 592.0,
"calib/step_q_gap": 0.09998562392179411,
"calib/step_q_w": 0.4068218085106383,
"calib/step_q_w_n": 752.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1888.0,
"completions/max_terminated_length": 1888.0,
"completions/mean_length": 560.5546875,
"completions/mean_terminated_length": 562.7529907226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.03374604508280754,
"kl": 0.1214447021484375,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.0719,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.02906222455203533,
"mask/share_reasoning": 0.8611117601394653,
"mask/share_step_conf": 0.1059197410941124,
"num_tokens": 46175073.0,
"reward": 0.8976184725761414,
"reward_std": 0.24325111508369446,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7013315558433533,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8095303773880005,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.7530584335327148,
"adv/mean_abs_reasoning": 0.595274806022644,
"adv/mean_abs_step_conf": 0.7521034479141235,
"adv/ratio_final_to_reasoning": 1.2650601468661329,
"adv/ratio_step_to_reasoning": 1.2634558699692622,
"adv/std_final_conf": 0.919442355632782,
"adv/std_reasoning": 0.8100360035896301,
"adv/std_step_conf": 0.9344271421432495,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7624487967229903,
"calib/avg_num_step_conf": 5.046875,
"calib/ece": 0.21579640000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.468,
"calib/gap": 0.38410724526369705,
"calib/mean_conf": 0.6102835999999999,
"calib/mu_c": 0.8008007936507938,
"calib/mu_w": 0.41669354838709677,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.16104000000000004,
"calib/std_conf": 0.41774921042539387,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5432770061728395,
"calib/step_q_c_n": 648.0,
"calib/step_q_gap": 0.13268508070700097,
"calib/step_q_w": 0.4105919254658385,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1710.0,
"completions/max_terminated_length": 1710.0,
"completions/mean_length": 529.07421875,
"completions/mean_terminated_length": 529.07421875,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.04108177125453949,
"kl": 0.1233978271484375,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.0005,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03036157228052616,
"mask/share_reasoning": 0.864544153213501,
"mask/share_step_conf": 0.10509428381919861,
"num_tokens": 46416460.0,
"reward": 0.9311327934265137,
"reward_std": 0.2457653433084488,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7367550134658813,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8325417637825012,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.7122936844825745,
"adv/mean_abs_reasoning": 0.6348901391029358,
"adv/mean_abs_step_conf": 0.7122489213943481,
"adv/ratio_final_to_reasoning": 1.1219164397308259,
"adv/ratio_step_to_reasoning": 1.121845934480438,
"adv/std_final_conf": 0.9117943048477173,
"adv/std_reasoning": 0.8591422438621521,
"adv/std_step_conf": 0.9355720281600952,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.789870541247762,
"calib/avg_num_step_conf": 5.28125,
"calib/ece": 0.1592181069958848,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.42386831275720166,
"calib/gap": 0.41256369646054264,
"calib/mean_conf": 0.592880658436214,
"calib/mu_c": 0.7728467153284672,
"calib/mu_w": 0.3602830188679245,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.09415637860082306,
"calib/std_conf": 0.40700139805775315,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5230487804878049,
"calib/step_q_c_n": 738.0,
"calib/step_q_gap": 0.1276579010089775,
"calib/step_q_w": 0.3953908794788274,
"calib/step_q_w_n": 614.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3059.0,
"completions/max_terminated_length": 3059.0,
"completions/mean_length": 574.4609375,
"completions/mean_terminated_length": 574.4609375,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.208,
"grad_norm": 0.031227953732013702,
"kl": 0.122222900390625,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.0479,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.02997458539903164,
"mask/share_reasoning": 0.8616489171981812,
"mask/share_step_conf": 0.10837653279304504,
"num_tokens": 46669506.0,
"reward": 0.9046463966369629,
"reward_std": 0.2648537755012512,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7336082458496094,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.7834970951080322,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.7024111151695251,
"adv/mean_abs_reasoning": 0.5777875185012817,
"adv/mean_abs_step_conf": 0.7787004709243774,
"adv/ratio_final_to_reasoning": 1.21569105021774,
"adv/ratio_step_to_reasoning": 1.347728093788253,
"adv/std_final_conf": 0.9007546901702881,
"adv/std_reasoning": 0.8098658919334412,
"adv/std_step_conf": 0.9352697134017944,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7357881136950903,
"calib/avg_num_step_conf": 5.28125,
"calib/ece": 0.24082156862745094,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.5254901960784314,
"calib/gap": 0.31706367663344404,
"calib/mean_conf": 0.7101588235294118,
"calib/mu_c": 0.8668255813953489,
"calib/mu_w": 0.5497619047619049,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2225490196078431,
"calib/std_conf": 0.3690129591095315,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5717097791798108,
"calib/step_q_c_n": 634.0,
"calib/step_q_gap": 0.1130997513246576,
"calib/step_q_w": 0.4586100278551532,
"calib/step_q_w_n": 718.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1559.0,
"completions/max_terminated_length": 1559.0,
"completions/mean_length": 465.85546875,
"completions/mean_terminated_length": 467.682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.028292890638113022,
"kl": 0.14324951171875,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.031,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03412295877933502,
"mask/share_reasoning": 0.8412783145904541,
"mask/share_step_conf": 0.12069250643253326,
"num_tokens": 46891309.0,
"reward": 0.9191616773605347,
"reward_std": 0.21659040451049805,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.7205460667610168,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8209022283554077,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.7571711540222168,
"adv/mean_abs_reasoning": 0.5579431653022766,
"adv/mean_abs_step_conf": 0.7533473372459412,
"adv/ratio_final_to_reasoning": 1.357075776010276,
"adv/ratio_step_to_reasoning": 1.3502223597233252,
"adv/std_final_conf": 0.9181612133979797,
"adv/std_reasoning": 0.7928898334503174,
"adv/std_step_conf": 0.9353706240653992,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7829932851239669,
"calib/avg_num_step_conf": 5.61328125,
"calib/ece": 0.20963855421686742,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.41365461847389556,
"calib/gap": 0.3733709969008265,
"calib/mean_conf": 0.5954216867469879,
"calib/mu_c": 0.7873553719008265,
"calib/mu_w": 0.413984375,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.15955823293172686,
"calib/std_conf": 0.4018290547590611,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5099694189602446,
"calib/step_q_c_n": 654.0,
"calib/step_q_gap": 0.11003327592065332,
"calib/step_q_w": 0.3999361430395913,
"calib/step_q_w_n": 783.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1864.0,
"completions/max_terminated_length": 1864.0,
"completions/mean_length": 568.50390625,
"completions/mean_terminated_length": 568.50390625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.025571517646312714,
"kl": 0.12994384765625,
"learning_rate": 8.333333333333334e-08,
"loss": 0.0078,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.028912434354424477,
"mask/share_reasoning": 0.8599445819854736,
"mask/share_step_conf": 0.11114296317100525,
"num_tokens": 47141902.0,
"reward": 0.9254905581474304,
"reward_std": 0.2386348396539688,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7346000075340271,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8280998468399048,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.7414839267730713,
"adv/mean_abs_reasoning": 0.5541132688522339,
"adv/mean_abs_step_conf": 0.7658023238182068,
"adv/ratio_final_to_reasoning": 1.3381450480493795,
"adv/ratio_step_to_reasoning": 1.3820320986076662,
"adv/std_final_conf": 0.9041799306869507,
"adv/std_reasoning": 0.7755098342895508,
"adv/std_step_conf": 0.9351991415023804,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.8141909469302809,
"calib/avg_num_step_conf": 5.61328125,
"calib/ece": 0.16147849462365585,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4314516129032258,
"calib/gap": 0.4539784946236559,
"calib/mean_conf": 0.6096505376344086,
"calib/mu_c": 0.8366397849462365,
"calib/mu_w": 0.38266129032258056,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1355645161290322,
"calib/std_conf": 0.4050996201538763,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5397018813314037,
"calib/step_q_c_n": 691.0,
"calib/step_q_gap": 0.1324096561303314,
"calib/step_q_w": 0.40729222520107233,
"calib/step_q_w_n": 746.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1763.0,
"completions/max_terminated_length": 1763.0,
"completions/mean_length": 507.2734375,
"completions/mean_terminated_length": 507.2734375,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.2112,
"grad_norm": 0.03272956982254982,
"kl": 0.1305694580078125,
"learning_rate": 5.555555555555556e-08,
"loss": -0.0318,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.033308904618024826,
"mask/share_reasoning": 0.8394599556922913,
"mask/share_step_conf": 0.127231165766716,
"num_tokens": 47377148.0,
"reward": 0.9434477090835571,
"reward_std": 0.203046053647995,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.7719271183013916,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8235619068145752,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.794552206993103,
"adv/mean_abs_reasoning": 0.6170809864997864,
"adv/mean_abs_step_conf": 0.7630276679992676,
"adv/ratio_final_to_reasoning": 1.2875979399397328,
"adv/ratio_step_to_reasoning": 1.2365113894163577,
"adv/std_final_conf": 0.9217703342437744,
"adv/std_reasoning": 0.8431092500686646,
"adv/std_step_conf": 0.9354465007781982,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.69229709246245,
"calib/avg_num_step_conf": 5.203125,
"calib/ece": 0.2812033195020747,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.4730290456431535,
"calib/gap": 0.22925795783381575,
"calib/mean_conf": 0.6927385892116182,
"calib/mu_c": 0.8097457627118645,
"calib/mu_w": 0.5804878048780487,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.24215767634854776,
"calib/std_conf": 0.3642794999324539,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.554266441821248,
"calib/step_q_c_n": 593.0,
"calib/step_q_gap": 0.1362556163814645,
"calib/step_q_w": 0.41801082543978346,
"calib/step_q_w_n": 739.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2462.0,
"completions/max_terminated_length": 2462.0,
"completions/mean_length": 583.765625,
"completions/mean_terminated_length": 583.765625,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.04038415104150772,
"kl": 0.1252593994140625,
"learning_rate": 2.777777777777778e-08,
"loss": -0.0451,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.030942562967538834,
"mask/share_reasoning": 0.8595815300941467,
"mask/share_step_conf": 0.10947591811418533,
"num_tokens": 47630792.0,
"reward": 0.8547995090484619,
"reward_std": 0.28671619296073914,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6443082094192505,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7863845825195312,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.6942980289459229,
"adv/mean_abs_reasoning": 0.4747547507286072,
"adv/mean_abs_step_conf": 0.7476445436477661,
"adv/ratio_final_to_reasoning": 1.4624351370478803,
"adv/ratio_step_to_reasoning": 1.57480160546125,
"adv/std_final_conf": 0.884438693523407,
"adv/std_reasoning": 0.7394700646400452,
"adv/std_step_conf": 0.9350288510322571,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.8390983153824909,
"calib/avg_num_step_conf": 5.54296875,
"calib/ece": 0.16172131147540994,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5368852459016393,
"calib/gap": 0.45574150787075374,
"calib/mean_conf": 0.6743442622950819,
"calib/mu_c": 0.8648591549295773,
"calib/mu_w": 0.4091176470588236,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.12704918032786897,
"calib/std_conf": 0.39565568676196067,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5899313186813188,
"calib/step_q_c_n": 728.0,
"calib/step_q_gap": 0.22475042143095697,
"calib/step_q_w": 0.3651808972503618,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2415.0,
"completions/max_terminated_length": 2415.0,
"completions/mean_length": 578.265625,
"completions/mean_terminated_length": 580.5333862304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.03473372384905815,
"kl": 0.1212615966796875,
"learning_rate": 0.0,
"loss": 0.0451,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.028863336890935898,
"mask/share_reasoning": 0.8620182275772095,
"mask/share_step_conf": 0.10521218180656433,
"num_tokens": 47886876.0,
"reward": 0.9555322527885437,
"reward_std": 0.22584620118141174,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7749546766281128,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8361097574234009,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": 74.20911938300357,
"train_runtime": 12329.5189,
"train_samples_per_second": 4.153,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 47886876,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}