Files
PureRL-1.5B-v7-s2-l2-kl-w1-b2/trainer_state.json
ModelHub XC a9e6e2b437 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/PureRL-1.5B-v7-s2-l2-kl-w1-b2
Source: Original Platform
2026-06-04 16:21:47 +08:00

11843 lines
476 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"adv/mean_abs_final_conf": 0.773959219455719,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7490277290344238,
"adv/std_final_conf": 0.9294352531433105,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343300461769104,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.38076182006817844,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.2003187250996017,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.2948207171314741,
"calib/gap": -0.026059730250481805,
"calib/mean_conf": 0.8737051792828686,
"calib/mu_c": 0.865606936416185,
"calib/mu_w": 0.8916666666666668,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.19239043824701207,
"calib/std_conf": 0.09027744273295583,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7959393232205367,
"calib/step_q_c_n": 857.0,
"calib/step_q_gap": -0.006446568895645877,
"calib/step_q_w": 0.8023858921161826,
"calib/step_q_w_n": 482.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 474.94921875,
"completions/mean_terminated_length": 478.68896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.04268835112452507,
"kl": 0.000291675329208374,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.078,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03466901555657387,
"mask/share_reasoning": 0.8340686559677124,
"mask/share_step_conf": 0.12344987690448761,
"num_tokens": 229171.0,
"reward": 0.7281402349472046,
"reward_std": 0.16804265975952148,
"rewards/accuracy_reward_step": 0.67578125,
"rewards/final_brier_reward_step": 0.7142800688743591,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7420004606246948,
"step": 1
},
{
"adv/mean_abs_final_conf": 0.7672724723815918,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7698483467102051,
"adv/std_final_conf": 0.9330522418022156,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345317482948303,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.44343065693430656,
"calib/avg_num_step_conf": 5.05859375,
"calib/ece": 0.3349411764705883,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.2823529411764706,
"calib/gap": 0.002352468143016151,
"calib/mean_conf": 0.8721960784313726,
"calib/mu_c": 0.8732846715328467,
"calib/mu_w": 0.8709322033898306,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3349411764705883,
"calib/std_conf": 0.07627016470309335,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7954391371340525,
"calib/step_q_c_n": 649.0,
"calib/step_q_gap": 0.011011892552009073,
"calib/step_q_w": 0.7844272445820434,
"calib/step_q_w_n": 646.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 492.9765625,
"completions/mean_terminated_length": 494.9098205566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.03963535279035568,
"kl": 0.00037539005279541016,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0095,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03364308178424835,
"mask/share_reasoning": 0.8523939251899719,
"mask/share_step_conf": 0.11005672812461853,
"num_tokens": 458661.0,
"reward": 0.6806339025497437,
"reward_std": 0.16487614810466766,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6320762038230896,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7291916012763977,
"step": 2
},
{
"adv/mean_abs_final_conf": 0.8019323348999023,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.781822681427002,
"adv/std_final_conf": 0.9299672245979309,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9331517219543457,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.4182875264270613,
"calib/avg_num_step_conf": 4.8671875,
"calib/ece": 0.22414342629482076,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.29880478087649404,
"calib/gap": -0.007272022551092383,
"calib/mean_conf": 0.8795219123505977,
"calib/mu_c": 0.877030303030303,
"calib/mu_w": 0.8843023255813954,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.22314741035856578,
"calib/std_conf": 0.051840339838680916,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7927968337730872,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.03380093213374291,
"calib/step_q_w": 0.7589959016393443,
"calib/step_q_w_n": 488.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2538.0,
"completions/max_terminated_length": 2538.0,
"completions/mean_length": 496.19921875,
"completions/mean_terminated_length": 500.1062927246094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0032,
"grad_norm": 0.04587876424193382,
"kl": 0.0007779151201248169,
"learning_rate": 7.5e-07,
"loss": 0.0432,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032924000173807144,
"mask/share_reasoning": 0.8506391048431396,
"mask/share_step_conf": 0.1086243987083435,
"num_tokens": 690944.0,
"reward": 0.7217806577682495,
"reward_std": 0.1670541763305664,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7043820023536682,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.739179253578186,
"step": 3
},
{
"adv/mean_abs_final_conf": 0.7670461535453796,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7715258002281189,
"adv/std_final_conf": 0.9306488037109375,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346135854721069,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.4428854875283447,
"calib/avg_num_step_conf": 5.16015625,
"calib/ece": 0.20373015873015893,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.23412698412698413,
"calib/gap": -0.0038690476190477163,
"calib/mean_conf": 0.8703968253968254,
"calib/mu_c": 0.8691071428571429,
"calib/mu_w": 0.8729761904761906,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20373015873015893,
"calib/std_conf": 0.04819604135839693,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.796887573964497,
"calib/step_q_c_n": 845.0,
"calib/step_q_gap": 0.0004169857292029011,
"calib/step_q_w": 0.7964705882352942,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2324.0,
"completions/max_terminated_length": 2324.0,
"completions/mean_length": 523.91015625,
"completions/mean_terminated_length": 525.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.04513922333717346,
"kl": 0.00026550889015197754,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0224,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.032802172005176544,
"mask/share_reasoning": 0.8497129082679749,
"mask/share_step_conf": 0.1135786697268486,
"num_tokens": 931233.0,
"reward": 0.7231439352035522,
"reward_std": 0.152303084731102,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.720788300037384,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7254995107650757,
"step": 4
},
{
"adv/mean_abs_final_conf": 0.7628365159034729,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7605997323989868,
"adv/std_final_conf": 0.9310415387153625,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9340444207191467,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.4282638481111763,
"calib/avg_num_step_conf": 4.75390625,
"calib/ece": 0.3553225806451613,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.2903225806451613,
"calib/gap": -0.010919292751353837,
"calib/mean_conf": 0.8835483870967742,
"calib/mu_c": 0.8783969465648854,
"calib/mu_w": 0.8893162393162393,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3553225806451613,
"calib/std_conf": 0.04231589246917829,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7984055727554179,
"calib/step_q_c_n": 646.0,
"calib/step_q_gap": 0.012661264524244542,
"calib/step_q_w": 0.7857443082311734,
"calib/step_q_w_n": 571.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2651.0,
"completions/max_terminated_length": 2651.0,
"completions/mean_length": 532.23828125,
"completions/mean_terminated_length": 534.3255004882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.03690319508314133,
"kl": 0.0002651214599609375,
"learning_rate": 1.25e-06,
"loss": -0.0964,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.034227386116981506,
"mask/share_reasoning": 0.8531880974769592,
"mask/share_step_conf": 0.10867826640605927,
"num_tokens": 1174174.0,
"reward": 0.6350550651550293,
"reward_std": 0.16529521346092224,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5958515405654907,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.6742585301399231,
"step": 5
},
{
"adv/mean_abs_final_conf": 0.7556248307228088,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7537124156951904,
"adv/std_final_conf": 0.9313980340957642,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342355728149414,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5419920624759954,
"calib/avg_num_step_conf": 5.26171875,
"calib/ece": 0.30031620553359695,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.25691699604743085,
"calib/gap": 0.0024433491230316795,
"calib/mean_conf": 0.8773913043478262,
"calib/mu_c": 0.8784246575342466,
"calib/mu_w": 0.8759813084112149,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.30031620553359695,
"calib/std_conf": 0.04182016179178139,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.8026478873239438,
"calib/step_q_c_n": 710.0,
"calib/step_q_gap": 0.0031502421120127577,
"calib/step_q_w": 0.799497645211931,
"calib/step_q_w_n": 637.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2382.0,
"completions/max_terminated_length": 2382.0,
"completions/mean_length": 445.99609375,
"completions/mean_terminated_length": 447.7451171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.0064,
"grad_norm": 0.041909750550985336,
"kl": 0.0006876289844512939,
"learning_rate": 1.5e-06,
"loss": 0.0061,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037554264068603516,
"mask/share_reasoning": 0.8299777507781982,
"mask/share_step_conf": 0.12856173515319824,
"num_tokens": 1394301.0,
"reward": 0.6814709901809692,
"reward_std": 0.15129441022872925,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6562492251396179,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7066926956176758,
"step": 6
},
{
"adv/mean_abs_final_conf": 0.7541855573654175,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7434177994728088,
"adv/std_final_conf": 0.9303765296936035,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9332534074783325,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.43873786407766985,
"calib/avg_num_step_conf": 5.625,
"calib/ece": 0.2913833992094862,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.30039525691699603,
"calib/gap": -0.009607766990291533,
"calib/mean_conf": 0.8807114624505928,
"calib/mu_c": 0.8768,
"calib/mu_w": 0.8864077669902916,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.289604743083004,
"calib/std_conf": 0.044262598383373285,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.7931042382588774,
"calib/step_q_c_n": 873.0,
"calib/step_q_gap": 0.019700358188330713,
"calib/step_q_w": 0.7734038800705467,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2309.0,
"completions/max_terminated_length": 2309.0,
"completions/mean_length": 547.6875,
"completions/mean_terminated_length": 549.8353271484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.04376210644841194,
"kl": 0.000276029109954834,
"learning_rate": 1.75e-06,
"loss": 0.0142,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.0305500365793705,
"mask/share_reasoning": 0.850243091583252,
"mask/share_step_conf": 0.11530055105686188,
"num_tokens": 1641933.0,
"reward": 0.6913090348243713,
"reward_std": 0.156791090965271,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.6555039286613464,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7271141409873962,
"step": 7
},
{
"adv/mean_abs_final_conf": 0.7654787302017212,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7579349279403687,
"adv/std_final_conf": 0.9322072863578796,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352673888206482,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.527547938008931,
"calib/avg_num_step_conf": 4.5390625,
"calib/ece": 0.3118473895582329,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.3132530120481928,
"calib/gap": 0.01203703703703729,
"calib/mean_conf": 0.8781124497991968,
"calib/mu_c": 0.8833333333333334,
"calib/mu_w": 0.8712962962962961,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.3118473895582329,
"calib/std_conf": 0.05985299432225467,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.7923771790808241,
"calib/step_q_c_n": 631.0,
"calib/step_q_gap": 0.02527736740474118,
"calib/step_q_w": 0.7670998116760829,
"calib/step_q_w_n": 531.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2666.0,
"completions/max_terminated_length": 2666.0,
"completions/mean_length": 518.8828125,
"completions/mean_terminated_length": 522.968505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.04647514969110489,
"kl": 0.00046569108963012695,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.055,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032357074320316315,
"mask/share_reasoning": 0.8551695346832275,
"mask/share_step_conf": 0.10466089844703674,
"num_tokens": 1881279.0,
"reward": 0.6843916177749634,
"reward_std": 0.1819065511226654,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.6338390707969666,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.734944224357605,
"step": 8
},
{
"adv/mean_abs_final_conf": 0.775026798248291,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7733014822006226,
"adv/std_final_conf": 0.9304593205451965,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345585107803345,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.5300694444444444,
"calib/avg_num_step_conf": 4.99609375,
"calib/ece": 0.2850819672131148,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.2581967213114754,
"calib/gap": 0.006669444444444506,
"calib/mean_conf": 0.8748360655737704,
"calib/mu_c": 0.8775694444444445,
"calib/mu_w": 0.8709,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.2848770491803279,
"calib/std_conf": 0.051418144506781754,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.7824285714285714,
"calib/step_q_c_n": 630.0,
"calib/step_q_gap": 0.09888465771516608,
"calib/step_q_w": 0.6835439137134053,
"calib/step_q_w_n": 649.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2715.0,
"completions/max_terminated_length": 2715.0,
"completions/mean_length": 512.48046875,
"completions/mean_terminated_length": 520.6151123046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.0096,
"grad_norm": 0.04125397652387619,
"kl": 0.0003396272659301758,
"learning_rate": 2.25e-06,
"loss": -0.0476,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.033868737518787384,
"mask/share_reasoning": 0.8458621501922607,
"mask/share_step_conf": 0.10464408993721008,
"num_tokens": 2120010.0,
"reward": 0.6498497724533081,
"reward_std": 0.19337286055088043,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6333246231079102,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.6663750410079956,
"step": 9
},
{
"adv/mean_abs_final_conf": 0.761988639831543,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7337956428527832,
"adv/std_final_conf": 0.9305899143218994,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9335968494415283,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.4558519437551695,
"calib/avg_num_step_conf": 5.41796875,
"calib/ece": 0.2638955823293173,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3654618473895582,
"calib/gap": -0.008000413564929976,
"calib/mean_conf": 0.8863855421686747,
"calib/mu_c": 0.8833974358974357,
"calib/mu_w": 0.8913978494623657,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.2618875502008033,
"calib/std_conf": 0.041126066737147415,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7831216931216931,
"calib/step_q_c_n": 756.0,
"calib/step_q_gap": 0.010296522080759263,
"calib/step_q_w": 0.7728251710409338,
"calib/step_q_w_n": 631.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 534.75,
"completions/mean_terminated_length": 538.9606323242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.049111757427453995,
"kl": 0.00047522783279418945,
"learning_rate": 2.5e-06,
"loss": -0.001,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032005734741687775,
"mask/share_reasoning": 0.848203718662262,
"mask/share_step_conf": 0.11197805404663086,
"num_tokens": 2363706.0,
"reward": 0.6990261077880859,
"reward_std": 0.16522833704948425,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6740808486938477,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.723971426486969,
"step": 10
},
{
"adv/mean_abs_final_conf": 0.7775152325630188,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7695873975753784,
"adv/std_final_conf": 0.9275996685028076,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341156482696533,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.4470198675496689,
"calib/avg_num_step_conf": 5.37109375,
"calib/ece": 0.3026171875,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.38671875,
"calib/gap": -0.009318826868495722,
"calib/mean_conf": 0.8887890625000001,
"calib/mu_c": 0.8849668874172185,
"calib/mu_w": 0.8942857142857142,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.30078125,
"calib/std_conf": 0.048304236412255744,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7787966751918158,
"calib/step_q_c_n": 782.0,
"calib/step_q_gap": -0.018420862750848532,
"calib/step_q_w": 0.7972175379426644,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1342.0,
"completions/max_terminated_length": 1342.0,
"completions/mean_length": 501.5546875,
"completions/mean_terminated_length": 503.5216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.03449160233139992,
"kl": 0.0007895231246948242,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0331,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03353225439786911,
"mask/share_reasoning": 0.8452185392379761,
"mask/share_step_conf": 0.117343008518219,
"num_tokens": 2596584.0,
"reward": 0.6882535219192505,
"reward_std": 0.13917958736419678,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6580113172531128,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.718495786190033,
"step": 11
},
{
"adv/mean_abs_final_conf": 0.7754501104354858,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7735542058944702,
"adv/std_final_conf": 0.9258955717086792,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9328970909118652,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.5133865993832352,
"calib/avg_num_step_conf": 5.60546875,
"calib/ece": 0.22546875000000002,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.46484375,
"calib/gap": 0.0020493411830670993,
"calib/mean_conf": 0.8927343750000001,
"calib/mu_c": 0.8933908045977013,
"calib/mu_w": 0.8913414634146342,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.21925781250000004,
"calib/std_conf": 0.05257754457331928,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7640579710144928,
"calib/step_q_c_n": 897.0,
"calib/step_q_gap": 0.008463175475459384,
"calib/step_q_w": 0.7555947955390334,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1486.0,
"completions/max_terminated_length": 1486.0,
"completions/mean_length": 463.76953125,
"completions/mean_terminated_length": 465.5882568359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.0128,
"grad_norm": 0.0531257800757885,
"kl": 0.0015319585800170898,
"learning_rate": 3e-06,
"loss": 0.0205,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.035761505365371704,
"mask/share_reasoning": 0.8305474519729614,
"mask/share_step_conf": 0.12978483736515045,
"num_tokens": 2819485.0,
"reward": 0.7614898681640625,
"reward_std": 0.13899016380310059,
"rewards/accuracy_reward_step": 0.6796875,
"rewards/final_brier_reward_step": 0.7311863303184509,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7917934060096741,
"step": 12
},
{
"adv/mean_abs_final_conf": 0.7574409246444702,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7679467797279358,
"adv/std_final_conf": 0.9247254729270935,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341288208961487,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5054759060135403,
"calib/avg_num_step_conf": 4.984375,
"calib/ece": 0.2670980392156863,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.5098039215686274,
"calib/gap": 0.007996814018319531,
"calib/mean_conf": 0.9023921568627451,
"calib/mu_c": 0.9053086419753088,
"calib/mu_w": 0.8973118279569893,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2670980392156863,
"calib/std_conf": 0.047898617783230665,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.7763171355498721,
"calib/step_q_c_n": 782.0,
"calib/step_q_gap": 0.02666126510452793,
"calib/step_q_w": 0.7496558704453442,
"calib/step_q_w_n": 494.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1364.0,
"completions/max_terminated_length": 1364.0,
"completions/mean_length": 478.09765625,
"completions/mean_terminated_length": 479.9725646972656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.047136712819337845,
"kl": 0.0021216869354248047,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0671,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.034416116774082184,
"mask/share_reasoning": 0.8466586470603943,
"mask/share_step_conf": 0.11501900106668472,
"num_tokens": 3046470.0,
"reward": 0.7384437918663025,
"reward_std": 0.15288321673870087,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6956464648246765,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7812410593032837,
"step": 13
},
{
"adv/mean_abs_final_conf": 0.7636485695838928,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7400187253952026,
"adv/std_final_conf": 0.921985387802124,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344073534011841,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5393435251798562,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.36896414342629474,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.7450199203187251,
"calib/gap": 0.006576310380267181,
"calib/mean_conf": 0.9227490039840637,
"calib/mu_c": 0.92568345323741,
"calib/mu_w": 0.9191071428571428,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36896414342629474,
"calib/std_conf": 0.037871831833412944,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.7489583333333333,
"calib/step_q_c_n": 720.0,
"calib/step_q_gap": 0.04890191584391157,
"calib/step_q_w": 0.7000564174894217,
"calib/step_q_w_n": 709.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2782.0,
"completions/max_terminated_length": 2782.0,
"completions/mean_length": 549.0,
"completions/mean_terminated_length": 553.3228149414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.04201479256153107,
"kl": 0.00584101676940918,
"learning_rate": 3.5e-06,
"loss": -0.0607,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03314138203859329,
"mask/share_reasoning": 0.8421472907066345,
"mask/share_step_conf": 0.11689884215593338,
"num_tokens": 3292414.0,
"reward": 0.6788654327392578,
"reward_std": 0.13546247780323029,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6064925789833069,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7512382864952087,
"step": 14
},
{
"adv/mean_abs_final_conf": 0.7457209825515747,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7551430463790894,
"adv/std_final_conf": 0.9170833826065063,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934794545173645,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5344794094794094,
"calib/avg_num_step_conf": 4.9453125,
"calib/ece": 0.35776892430278884,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8207171314741036,
"calib/gap": 0.013309375809375745,
"calib/mean_conf": 0.9274900398406375,
"calib/mu_c": 0.9332167832167831,
"calib/mu_w": 0.9199074074074074,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.35776892430278884,
"calib/std_conf": 0.052168846510283806,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.7192296918767507,
"calib/step_q_c_n": 714.0,
"calib/step_q_gap": -0.004067409572524716,
"calib/step_q_w": 0.7232971014492754,
"calib/step_q_w_n": 552.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2105.0,
"completions/max_terminated_length": 2105.0,
"completions/mean_length": 475.109375,
"completions/mean_terminated_length": 480.74310302734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.016,
"grad_norm": 0.041417043656110764,
"kl": 0.008565902709960938,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0614,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034135397523641586,
"mask/share_reasoning": 0.8391706943511963,
"mask/share_step_conf": 0.11497519910335541,
"num_tokens": 3521922.0,
"reward": 0.6850520372390747,
"reward_std": 0.1696966141462326,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6183484196662903,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7517555952072144,
"step": 15
},
{
"adv/mean_abs_final_conf": 0.7481696605682373,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7608056664466858,
"adv/std_final_conf": 0.9169109463691711,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348391890525818,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5255022637238257,
"calib/avg_num_step_conf": 6.68359375,
"calib/ece": 0.31497959183673474,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8734693877551021,
"calib/gap": 0.016136813808715234,
"calib/mean_conf": 0.9353877551020408,
"calib/mu_c": 0.9415131578947369,
"calib/mu_w": 0.9253763440860217,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.31497959183673474,
"calib/std_conf": 0.07009871067157779,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6679646936656283,
"calib/step_q_c_n": 963.0,
"calib/step_q_gap": 0.0389941054303341,
"calib/step_q_w": 0.6289705882352942,
"calib/step_q_w_n": 748.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3000.0,
"completions/max_terminated_length": 3000.0,
"completions/mean_length": 693.61328125,
"completions/mean_terminated_length": 696.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.04022593051195145,
"kl": 0.008114814758300781,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0603,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.025023385882377625,
"mask/share_reasoning": 0.8619957566261292,
"mask/share_step_conf": 0.10907462239265442,
"num_tokens": 3808335.0,
"reward": 0.6982525587081909,
"reward_std": 0.16592827439308167,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6386706829071045,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7578344941139221,
"step": 16
},
{
"adv/mean_abs_final_conf": 0.7530292868614197,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7690562009811401,
"adv/std_final_conf": 0.9123719334602356,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345831871032715,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.43895716945996277,
"calib/avg_num_step_conf": 5.515625,
"calib/ece": 0.2460236220472442,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.8937007874015748,
"calib/gap": -0.010618249534450785,
"calib/mean_conf": 0.9418503937007875,
"calib/mu_c": 0.9387150837988826,
"calib/mu_w": 0.9493333333333334,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.24157480314960642,
"calib/std_conf": 0.04497726087600256,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6559919839679358,
"calib/step_q_c_n": 998.0,
"calib/step_q_gap": 0.0018132400065831167,
"calib/step_q_w": 0.6541787439613527,
"calib/step_q_w_n": 414.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2057.0,
"completions/max_terminated_length": 2057.0,
"completions/mean_length": 516.7421875,
"completions/mean_terminated_length": 516.7421875,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.03731020539999008,
"kl": 0.012533187866210938,
"learning_rate": 4.25e-06,
"loss": 0.0001,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.033470265567302704,
"mask/share_reasoning": 0.8462820053100586,
"mask/share_step_conf": 0.12024769186973572,
"num_tokens": 4044149.0,
"reward": 0.7667979001998901,
"reward_std": 0.16370335221290588,
"rewards/accuracy_reward_step": 0.69921875,
"rewards/final_brier_reward_step": 0.7154378890991211,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8181577920913696,
"step": 17
},
{
"adv/mean_abs_final_conf": 0.7296743392944336,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7674551010131836,
"adv/std_final_conf": 0.9048014879226685,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343299865722656,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5590599279835391,
"calib/avg_num_step_conf": 4.94921875,
"calib/ece": 0.3760317460317461,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9325396825396826,
"calib/gap": 0.010787037037037095,
"calib/mean_conf": 0.9474603174603174,
"calib/mu_c": 0.9520833333333335,
"calib/mu_w": 0.9412962962962964,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.3760317460317461,
"calib/std_conf": 0.04386010422646486,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.6303954802259887,
"calib/step_q_c_n": 708.0,
"calib/step_q_gap": -0.013862122636265295,
"calib/step_q_w": 0.644257602862254,
"calib/step_q_w_n": 559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2879.0,
"completions/max_terminated_length": 2879.0,
"completions/mean_length": 514.765625,
"completions/mean_terminated_length": 516.7843627929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.0192,
"grad_norm": 0.03311552107334137,
"kl": 0.0136260986328125,
"learning_rate": 4.5e-06,
"loss": -0.068,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03235187008976936,
"mask/share_reasoning": 0.8596428632736206,
"mask/share_step_conf": 0.10409902781248093,
"num_tokens": 4286649.0,
"reward": 0.6812572479248047,
"reward_std": 0.13839876651763916,
"rewards/accuracy_reward_step": 0.5625,
"rewards/final_brier_reward_step": 0.6026082038879395,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7599062323570251,
"step": 18
},
{
"adv/mean_abs_final_conf": 0.7078334093093872,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7547190189361572,
"adv/std_final_conf": 0.9071928262710571,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344522953033447,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5825134461498097,
"calib/avg_num_step_conf": 4.734375,
"calib/ece": 0.34913043478260863,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9723320158102767,
"calib/gap": 0.005721500721500572,
"calib/mean_conf": 0.9578260869565217,
"calib/mu_c": 0.9600649350649348,
"calib/mu_w": 0.9543434343434343,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.34913043478260863,
"calib/std_conf": 0.02437392728569622,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.6258414464534076,
"calib/step_q_c_n": 719.0,
"calib/step_q_gap": 0.05862035111872199,
"calib/step_q_w": 0.5672210953346856,
"calib/step_q_w_n": 493.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2525.0,
"completions/max_terminated_length": 2525.0,
"completions/mean_length": 500.71875,
"completions/mean_terminated_length": 500.71875,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.03164170682430267,
"kl": 0.018739700317382812,
"learning_rate": 4.75e-06,
"loss": -0.0468,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032328102737665176,
"mask/share_reasoning": 0.8583135604858398,
"mask/share_step_conf": 0.10935834795236588,
"num_tokens": 4519593.0,
"reward": 0.712796688079834,
"reward_std": 0.13104841113090515,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6345300674438477,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7910631895065308,
"step": 19
},
{
"adv/mean_abs_final_conf": 0.6984207630157471,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7432424426078796,
"adv/std_final_conf": 0.8983197212219238,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347612261772156,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.45215311004784686,
"calib/avg_num_step_conf": 5.484375,
"calib/ece": 0.3531075697211156,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9800796812749004,
"calib/gap": 0.01472953216374251,
"calib/mean_conf": 0.957808764940239,
"calib/mu_c": 0.9636184210526314,
"calib/mu_w": 0.9488888888888889,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.35266932270916346,
"calib/std_conf": 0.08103156683400753,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5916091954022988,
"calib/step_q_c_n": 783.0,
"calib/step_q_gap": 0.012994058526292429,
"calib/step_q_w": 0.5786151368760064,
"calib/step_q_w_n": 621.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2925.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 488.15234375,
"completions/mean_terminated_length": 488.15234375,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.034622061997652054,
"kl": 0.027156829833984375,
"learning_rate": 5e-06,
"loss": -0.0,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03587206080555916,
"mask/share_reasoning": 0.8324520587921143,
"mask/share_step_conf": 0.1316758692264557,
"num_tokens": 4749432.0,
"reward": 0.7167103886604309,
"reward_std": 0.14828336238861084,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6250980496406555,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8083226680755615,
"step": 20
},
{
"adv/mean_abs_final_conf": 0.7363002896308899,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7545459270477295,
"adv/std_final_conf": 0.8944153189659119,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350616931915283,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.520718954248366,
"calib/avg_num_step_conf": 6.04296875,
"calib/ece": 0.3646640316205533,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.9841897233201581,
"calib/gap": 0.0015000000000002789,
"calib/mean_conf": 0.9694071146245059,
"calib/mu_c": 0.9700000000000002,
"calib/mu_w": 0.9684999999999999,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.3646640316205533,
"calib/std_conf": 0.019028901172082977,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5519160997732426,
"calib/step_q_c_n": 882.0,
"calib/step_q_gap": 0.031510084735648536,
"calib/step_q_w": 0.5204060150375941,
"calib/step_q_w_n": 665.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2625.0,
"completions/max_terminated_length": 2625.0,
"completions/mean_length": 515.29296875,
"completions/mean_terminated_length": 517.3137817382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.0224,
"grad_norm": 0.04176400601863861,
"kl": 0.032192230224609375,
"learning_rate": 4.9722222222222224e-06,
"loss": -0.0224,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03280477970838547,
"mask/share_reasoning": 0.8365024328231812,
"mask/share_step_conf": 0.1267865151166916,
"num_tokens": 4984307.0,
"reward": 0.7157672643661499,
"reward_std": 0.18473777174949646,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6209827661514282,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.8105518221855164,
"step": 21
},
{
"adv/mean_abs_final_conf": 0.6889126300811768,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7624020576477051,
"adv/std_final_conf": 0.8601471185684204,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344401955604553,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.40761486103233124,
"calib/avg_num_step_conf": 5.62890625,
"calib/ece": 0.2945275590551182,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.9921875,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": -0.004523539421440792,
"calib/mean_conf": 0.9716929133858269,
"calib/mu_c": 0.9702325581395349,
"calib/mu_w": 0.9747560975609757,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.2945275590551182,
"calib/std_conf": 0.014110370236287091,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5819481865284973,
"calib/step_q_c_n": 965.0,
"calib/step_q_gap": 0.027935581486480454,
"calib/step_q_w": 0.5540126050420169,
"calib/step_q_w_n": 476.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2181.0,
"completions/max_terminated_length": 2181.0,
"completions/mean_length": 477.93359375,
"completions/mean_terminated_length": 479.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.0354311503469944,
"kl": 0.048328399658203125,
"learning_rate": 4.944444444444445e-06,
"loss": -0.0143,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03266695886850357,
"mask/share_reasoning": 0.836262583732605,
"mask/share_step_conf": 0.12716418504714966,
"num_tokens": 5208474.0,
"reward": 0.7574120759963989,
"reward_std": 0.13079439103603363,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6870542764663696,
"rewards/format_reward_step": 0.9921875,
"rewards/step_l2_reward": 0.8277699947357178,
"step": 22
},
{
"adv/mean_abs_final_conf": 0.7558117508888245,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7470569610595703,
"adv/std_final_conf": 0.8796916007995605,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351850152015686,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.4980074719800748,
"calib/avg_num_step_conf": 5.39453125,
"calib/ece": 0.40570312500000005,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 1.0,
"calib/frac_conf_gt_0.9": 0.99609375,
"calib/gap": -0.0002914072229142084,
"calib/mean_conf": 0.976015625,
"calib/mu_c": 0.975890410958904,
"calib/mu_w": 0.9761818181818183,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.40570312500000005,
"calib/std_conf": 0.014188455020169575,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5975,
"calib/step_q_c_n": 784.0,
"calib/step_q_gap": 0.04490368509212739,
"calib/step_q_w": 0.5525963149078726,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1453.0,
"completions/max_terminated_length": 1453.0,
"completions/mean_length": 495.65625,
"completions/mean_terminated_length": 497.60003662109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.026641665026545525,
"kl": 0.04611968994140625,
"learning_rate": 4.9166666666666665e-06,
"loss": -0.0015,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03485388308763504,
"mask/share_reasoning": 0.8375639319419861,
"mask/share_step_conf": 0.12367593497037888,
"num_tokens": 5439298.0,
"reward": 0.6970131397247314,
"reward_std": 0.16669629514217377,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.5900046825408936,
"rewards/format_reward_step": 1.0,
"rewards/step_l2_reward": 0.8040215969085693,
"step": 23
},
{
"adv/mean_abs_final_conf": 0.7947186827659607,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7730585932731628,
"adv/std_final_conf": 0.9157706499099731,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354721307754517,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5589953271028038,
"calib/avg_num_step_conf": 6.30859375,
"calib/ece": 0.5504780876494024,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": 0.003018561786085283,
"calib/mean_conf": 0.976772908366534,
"calib/mu_c": 0.9785046728971963,
"calib/mu_w": 0.975486111111111,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.5504780876494024,
"calib/std_conf": 0.013071372536486728,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5927873563218391,
"calib/step_q_c_n": 696.0,
"calib/step_q_gap": 0.01646526709441798,
"calib/step_q_w": 0.5763220892274211,
"calib/step_q_w_n": 919.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2747.0,
"completions/max_terminated_length": 2747.0,
"completions/mean_length": 578.1171875,
"completions/mean_terminated_length": 582.6693115234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.0256,
"grad_norm": 0.024851465597748756,
"kl": 0.027318954467773438,
"learning_rate": 4.888888888888889e-06,
"loss": -0.0613,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.030539821833372116,
"mask/share_reasoning": 0.8392899036407471,
"mask/share_step_conf": 0.1223578006029129,
"num_tokens": 5691808.0,
"reward": 0.6098623275756836,
"reward_std": 0.20302972197532654,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.4448503851890564,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.774874210357666,
"step": 24
},
{
"adv/mean_abs_final_conf": 0.6968058347702026,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7300499677658081,
"adv/std_final_conf": 0.8484832644462585,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348785281181335,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5112973760932945,
"calib/avg_num_step_conf": 5.7734375,
"calib/ece": 0.3676587301587303,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0004916512059368605,
"calib/mean_conf": 0.9787698412698415,
"calib/mu_c": 0.9789610389610389,
"calib/mu_w": 0.9784693877551021,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3676587301587303,
"calib/std_conf": 0.012711488803198966,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5837976190476191,
"calib/step_q_c_n": 840.0,
"calib/step_q_gap": 0.009502948201224104,
"calib/step_q_w": 0.574294670846395,
"calib/step_q_w_n": 638.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2611.0,
"completions/max_terminated_length": 2611.0,
"completions/mean_length": 515.44921875,
"completions/mean_terminated_length": 517.4706420898438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.02434094063937664,
"kl": 0.033573150634765625,
"learning_rate": 4.861111111111111e-06,
"loss": -0.0468,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03205115348100662,
"mask/share_reasoning": 0.8405336141586304,
"mask/share_step_conf": 0.12350897490978241,
"num_tokens": 5926987.0,
"reward": 0.7052030563354492,
"reward_std": 0.15304076671600342,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6174441576004028,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7929619550704956,
"step": 25
},
{
"adv/mean_abs_final_conf": 0.701289176940918,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7613296508789062,
"adv/std_final_conf": 0.8713129758834839,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348834753036499,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.4972997299729973,
"calib/avg_num_step_conf": 5.22265625,
"calib/ece": 0.36984313725490203,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.996078431372549,
"calib/gap": 0.009512665552269817,
"calib/mean_conf": 0.973764705882353,
"calib/mu_c": 0.9775324675324677,
"calib/mu_w": 0.9680198019801979,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.36984313725490203,
"calib/std_conf": 0.062295497031516586,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.62498687664042,
"calib/step_q_c_n": 762.0,
"calib/step_q_gap": 0.0509868766404199,
"calib/step_q_w": 0.5740000000000001,
"calib/step_q_w_n": 575.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2347.0,
"completions/max_terminated_length": 2347.0,
"completions/mean_length": 508.30859375,
"completions/mean_terminated_length": 508.30859375,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.027733333333333332,
"grad_norm": 73.61032104492188,
"kl": 64.53376007080078,
"learning_rate": 4.833333333333333e-06,
"loss": 1.4161,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.030653975903987885,
"mask/share_reasoning": 0.857771635055542,
"mask/share_step_conf": 0.11157442629337311,
"num_tokens": 6162354.0,
"reward": 0.7190060615539551,
"reward_std": 0.17579255998134613,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.6222456693649292,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8157663345336914,
"step": 26
},
{
"adv/mean_abs_final_conf": 0.7280699610710144,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7500325441360474,
"adv/std_final_conf": 0.878308117389679,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935150682926178,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5284929950776222,
"calib/avg_num_step_conf": 6.078125,
"calib/ece": 0.4282608695652175,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 1.0,
"calib/gap": 0.0016628802221377859,
"calib/mean_conf": 0.9776679841897234,
"calib/mu_c": 0.9784172661870502,
"calib/mu_w": 0.9767543859649124,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.4282608695652175,
"calib/std_conf": 0.012842018279301592,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5972738853503184,
"calib/step_q_c_n": 785.0,
"calib/step_q_gap": 0.036378943716077194,
"calib/step_q_w": 0.5608949416342413,
"calib/step_q_w_n": 771.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2678.0,
"completions/max_terminated_length": 2678.0,
"completions/mean_length": 519.99609375,
"completions/mean_terminated_length": 519.99609375,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.0288,
"grad_norm": 0.027362557128071785,
"kl": 0.040985107421875,
"learning_rate": 4.805555555555556e-06,
"loss": -0.0143,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.032479990273714066,
"mask/share_reasoning": 0.8387442827224731,
"mask/share_step_conf": 0.1287757158279419,
"num_tokens": 6400689.0,
"reward": 0.6784756779670715,
"reward_std": 0.18861247599124908,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.5627851486206055,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.7941662073135376,
"step": 27
},
{
"adv/mean_abs_final_conf": 0.692762017250061,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7368833422660828,
"adv/std_final_conf": 0.8757468461990356,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355268478393555,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5933370427041313,
"calib/avg_num_step_conf": 5.3203125,
"calib/ece": 0.3396385542168675,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.9879518072289156,
"calib/gap": 0.0077945472249270065,
"calib/mean_conf": 0.9741767068273092,
"calib/mu_c": 0.9770253164556962,
"calib/mu_w": 0.9692307692307692,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.3396385542168675,
"calib/std_conf": 0.021458489384508266,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.6009316037735849,
"calib/step_q_c_n": 848.0,
"calib/step_q_gap": 0.0021378294545186804,
"calib/step_q_w": 0.5987937743190662,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2963.0,
"completions/max_terminated_length": 2963.0,
"completions/mean_length": 549.1171875,
"completions/mean_terminated_length": 560.0557861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.044755179435014725,
"kl": 0.041538238525390625,
"learning_rate": 4.777777777777778e-06,
"loss": -0.1011,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03004935197532177,
"mask/share_reasoning": 0.8406248688697815,
"mask/share_step_conf": 0.10979451984167099,
"num_tokens": 6648207.0,
"reward": 0.7052626609802246,
"reward_std": 0.18966689705848694,
"rewards/accuracy_reward_step": 0.6171875,
"rewards/final_brier_reward_step": 0.6301574110984802,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7803678512573242,
"step": 28
},
{
"adv/mean_abs_final_conf": 0.7540074586868286,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7608869671821594,
"adv/std_final_conf": 0.8825486898422241,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354591965675354,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5462503183091417,
"calib/avg_num_step_conf": 5.76953125,
"calib/ece": 0.5014741035856575,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.9960159362549801,
"calib/gap": 0.009847211611917595,
"calib/mean_conf": 0.975577689243028,
"calib/mu_c": 0.9807563025210084,
"calib/mu_w": 0.9709090909090908,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.5014741035856575,
"calib/std_conf": 0.06301809121954761,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5873322932917316,
"calib/step_q_c_n": 641.0,
"calib/step_q_gap": 0.027404063626659814,
"calib/step_q_w": 0.5599282296650718,
"calib/step_q_w_n": 836.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2605.0,
"completions/max_terminated_length": 2605.0,
"completions/mean_length": 580.1484375,
"completions/mean_terminated_length": 582.423583984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.02169017121195793,
"kl": 0.042827606201171875,
"learning_rate": 4.75e-06,
"loss": -0.0789,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.028678342700004578,
"mask/share_reasoning": 0.8571142554283142,
"mask/share_step_conf": 0.1103011816740036,
"num_tokens": 6903853.0,
"reward": 0.6350164413452148,
"reward_std": 0.1887853443622589,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.49021053314208984,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7798223495483398,
"step": 29
},
{
"adv/mean_abs_final_conf": 0.7241463661193848,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7513007521629333,
"adv/std_final_conf": 0.8834677934646606,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353786706924438,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.5153054313964514,
"calib/avg_num_step_conf": 6.20703125,
"calib/ece": 0.4110612244897961,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.9836734693877551,
"calib/gap": 0.009838141676824952,
"calib/mean_conf": 0.9743265306122451,
"calib/mu_c": 0.978623188405797,
"calib/mu_w": 0.968785046728972,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4110612244897961,
"calib/std_conf": 0.0643147197278946,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5532292917166867,
"calib/step_q_c_n": 833.0,
"calib/step_q_gap": 0.019089080076474918,
"calib/step_q_w": 0.5341402116402117,
"calib/step_q_w_n": 756.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2930.0,
"completions/max_terminated_length": 2930.0,
"completions/mean_length": 615.69140625,
"completions/mean_terminated_length": 622.9921264648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.032,
"grad_norm": 0.034742482006549835,
"kl": 0.05080413818359375,
"learning_rate": 4.722222222222222e-06,
"loss": -0.0564,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.028462983667850494,
"mask/share_reasoning": 0.8461865186691284,
"mask/share_step_conf": 0.11363177001476288,
"num_tokens": 7168454.0,
"reward": 0.6671479344367981,
"reward_std": 0.20367759466171265,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.560566782951355,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.7737290859222412,
"step": 30
},
{
"adv/mean_abs_final_conf": 0.7788076996803284,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7677104473114014,
"adv/std_final_conf": 0.9012274742126465,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355896711349487,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5034076015727391,
"calib/avg_num_step_conf": 6.3046875,
"calib/ece": 0.5326104417670683,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.9839357429718876,
"calib/gap": 0.014532110091743156,
"calib/mean_conf": 0.9703614457831325,
"calib/mu_c": 0.978532110091743,
"calib/mu_w": 0.9639999999999999,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.5326104417670683,
"calib/std_conf": 0.0843321857283264,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5646365105008078,
"calib/step_q_c_n": 619.0,
"calib/step_q_gap": 0.0229179175359836,
"calib/step_q_w": 0.5417185929648242,
"calib/step_q_w_n": 995.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2236.0,
"completions/max_terminated_length": 2236.0,
"completions/mean_length": 607.578125,
"completions/mean_terminated_length": 607.578125,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.02423461340367794,
"kl": 0.048610687255859375,
"learning_rate": 4.694444444444445e-06,
"loss": -0.0788,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.02809649333357811,
"mask/share_reasoning": 0.8581934571266174,
"mask/share_step_conf": 0.11371007561683655,
"num_tokens": 7429906.0,
"reward": 0.6151062250137329,
"reward_std": 0.20114478468894958,
"rewards/accuracy_reward_step": 0.42578125,
"rewards/final_brier_reward_step": 0.4573843777179718,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7728281021118164,
"step": 31
},
{
"adv/mean_abs_final_conf": 0.7334926128387451,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7424031496047974,
"adv/std_final_conf": 0.8949069976806641,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356457591056824,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5813483288052064,
"calib/avg_num_step_conf": 5.86328125,
"calib/ece": 0.49710317460317455,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9722222222222222,
"calib/gap": 0.017518796992481378,
"calib/mean_conf": 0.9693253968253969,
"calib/mu_c": 0.9785714285714286,
"calib/mu_w": 0.9610526315789473,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.49710317460317455,
"calib/std_conf": 0.07284429244836445,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5945141065830721,
"calib/step_q_c_n": 638.0,
"calib/step_q_gap": 0.09094516104425399,
"calib/step_q_w": 0.5035689455388181,
"calib/step_q_w_n": 863.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2739.0,
"completions/max_terminated_length": 2739.0,
"completions/mean_length": 579.6484375,
"completions/mean_terminated_length": 579.6484375,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.02419840358197689,
"kl": 0.06121063232421875,
"learning_rate": 4.666666666666667e-06,
"loss": -0.0403,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.0297107957303524,
"mask/share_reasoning": 0.8531237840652466,
"mask/share_step_conf": 0.11716543138027191,
"num_tokens": 7685000.0,
"reward": 0.6433032751083374,
"reward_std": 0.17889225482940674,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.4952601194381714,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7913463115692139,
"step": 32
},
{
"adv/mean_abs_final_conf": 0.7403093576431274,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7863280773162842,
"adv/std_final_conf": 0.8979626893997192,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354382753372192,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5144976606747106,
"calib/avg_num_step_conf": 5.63671875,
"calib/ece": 0.46631372549019623,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9882352941176471,
"calib/gap": -0.005614996306328446,
"calib/mean_conf": 0.9721960784313727,
"calib/mu_c": 0.9694656488549619,
"calib/mu_w": 0.9750806451612903,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.46239215686274526,
"calib/std_conf": 0.0633808963634705,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.5588917861799217,
"calib/step_q_c_n": 767.0,
"calib/step_q_gap": -0.024762059973924333,
"calib/step_q_w": 0.5836538461538461,
"calib/step_q_w_n": 676.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1362.0,
"completions/max_terminated_length": 1362.0,
"completions/mean_length": 530.28125,
"completions/mean_terminated_length": 532.36083984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0352,
"grad_norm": 0.026419516652822495,
"kl": 0.07160186767578125,
"learning_rate": 4.638888888888889e-06,
"loss": -0.0381,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03006555698812008,
"mask/share_reasoning": 0.852064847946167,
"mask/share_step_conf": 0.11396338045597076,
"num_tokens": 7927624.0,
"reward": 0.6580392122268677,
"reward_std": 0.17391237616539001,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.5310878753662109,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.7849905490875244,
"step": 33
},
{
"adv/mean_abs_final_conf": 0.7404600977897644,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7718697786331177,
"adv/std_final_conf": 0.9089685678482056,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351857304573059,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.48072519083969467,
"calib/avg_num_step_conf": 6.2109375,
"calib/ece": 0.4719521912350599,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.9601593625498008,
"calib/gap": -0.03308524173028016,
"calib/mean_conf": 0.9585657370517928,
"calib/mu_c": 0.9427480916030534,
"calib/mu_w": 0.9758333333333336,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4543027888446216,
"calib/std_conf": 0.11797568230315518,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.49260292164674635,
"calib/step_q_c_n": 753.0,
"calib/step_q_gap": -0.03952372112505764,
"calib/step_q_w": 0.532126642771804,
"calib/step_q_w_n": 837.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3045.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 527.76171875,
"completions/mean_terminated_length": 527.76171875,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.01992679014801979,
"kl": 0.0793304443359375,
"learning_rate": 4.611111111111112e-06,
"loss": -0.0357,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032113149762153625,
"mask/share_reasoning": 0.8370362520217896,
"mask/share_step_conf": 0.13085061311721802,
"num_tokens": 8167843.0,
"reward": 0.6504442691802979,
"reward_std": 0.2038143277168274,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.519045352935791,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7818431854248047,
"step": 34
},
{
"adv/mean_abs_final_conf": 0.7602913975715637,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7359387278556824,
"adv/std_final_conf": 0.9223473072052002,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9359014630317688,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.5150700025926885,
"calib/avg_num_step_conf": 5.97265625,
"calib/ece": 0.43694779116465876,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.9317269076305221,
"calib/gap": -0.0009625356494685766,
"calib/mean_conf": 0.9481927710843374,
"calib/mu_c": 0.9477443609022558,
"calib/mu_w": 0.9487068965517244,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.4255020080321287,
"calib/std_conf": 0.129554174477126,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5164872139973082,
"calib/step_q_c_n": 743.0,
"calib/step_q_gap": 0.024947773793745853,
"calib/step_q_w": 0.4915394402035624,
"calib/step_q_w_n": 786.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2838.0,
"completions/max_terminated_length": 2838.0,
"completions/mean_length": 609.65234375,
"completions/mean_terminated_length": 612.0431518554688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.08677566051483154,
"kl": 0.20432281494140625,
"learning_rate": 4.583333333333333e-06,
"loss": -0.0317,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.027661994099617004,
"mask/share_reasoning": 0.8633013963699341,
"mask/share_step_conf": 0.10513034462928772,
"num_tokens": 8433170.0,
"reward": 0.6564799547195435,
"reward_std": 0.23788407444953918,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.5431902408599854,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7697696685791016,
"step": 35
},
{
"adv/mean_abs_final_conf": 0.7184458374977112,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7714102864265442,
"adv/std_final_conf": 0.9145676493644714,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353185892105103,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.5131339310731298,
"calib/avg_num_step_conf": 6.1875,
"calib/ece": 0.30407843137254903,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.99609375,
"calib/frac_conf_gt_0.9": 0.9137254901960784,
"calib/gap": -0.0005365648641073584,
"calib/mean_conf": 0.9516862745098039,
"calib/mu_c": 0.9515116279069767,
"calib/mu_w": 0.9520481927710841,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.29062745098039217,
"calib/std_conf": 0.11270601880270643,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.47419541015625,
"calib/step_q_c_n": 1024.0,
"calib/step_q_gap": 0.007927553013392818,
"calib/step_q_w": 0.4662678571428572,
"calib/step_q_w_n": 560.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2181.0,
"completions/max_terminated_length": 2181.0,
"completions/mean_length": 502.25390625,
"completions/mean_terminated_length": 504.22357177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.0384,
"grad_norm": 0.02859259396791458,
"kl": 0.09160614013671875,
"learning_rate": 4.555555555555556e-06,
"loss": -0.0312,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03455417603254318,
"mask/share_reasoning": 0.8255020380020142,
"mask/share_step_conf": 0.13603754341602325,
"num_tokens": 8664459.0,
"reward": 0.7533528804779053,
"reward_std": 0.16682732105255127,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.6879905462265015,
"rewards/format_reward_step": 0.99609375,
"rewards/step_l2_reward": 0.8187150955200195,
"step": 36
},
{
"adv/mean_abs_final_conf": 0.724345326423645,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.752377986907959,
"adv/std_final_conf": 0.9264432787895203,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355803728103638,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.4804136011135415,
"calib/avg_num_step_conf": 5.5859375,
"calib/ece": 0.53625,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.8508064516129032,
"calib/gap": -0.027505136872804425,
"calib/mean_conf": 0.9165725806451614,
"calib/mu_c": 0.9009345794392523,
"calib/mu_w": 0.9284397163120567,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.5106854838709678,
"calib/std_conf": 0.19489644369333167,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4986245353159851,
"calib/step_q_c_n": 538.0,
"calib/step_q_gap": 0.06208866087652326,
"calib/step_q_w": 0.43653587443946185,
"calib/step_q_w_n": 892.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2615.0,
"completions/max_terminated_length": 2615.0,
"completions/mean_length": 559.3046875,
"completions/mean_terminated_length": 563.7086791992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.03272354602813721,
"kl": 0.09531402587890625,
"learning_rate": 4.527777777777778e-06,
"loss": -0.1342,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.030177973210811615,
"mask/share_reasoning": 0.8478702306747437,
"mask/share_step_conf": 0.11413928866386414,
"num_tokens": 8914737.0,
"reward": 0.6202419400215149,
"reward_std": 0.1850229799747467,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.4493539035320282,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7911299467086792,
"step": 37
},
{
"adv/mean_abs_final_conf": 0.7505814433097839,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7393084764480591,
"adv/std_final_conf": 0.9262279272079468,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.93584144115448,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5682171044119549,
"calib/avg_num_step_conf": 5.70703125,
"calib/ece": 0.4629192771084339,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.8634538152610441,
"calib/gap": 0.05123264329149957,
"calib/mean_conf": 0.9118598393574298,
"calib/mu_c": 0.9388135593220339,
"calib/mu_w": 0.8875809160305344,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.4504417670682732,
"calib/std_conf": 0.20639150264249348,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.493696682464455,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.006098977150445273,
"calib/step_q_w": 0.4875977053140097,
"calib/step_q_w_n": 828.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2836.0,
"completions/max_terminated_length": 2836.0,
"completions/mean_length": 550.43359375,
"completions/mean_terminated_length": 550.43359375,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.029129181057214737,
"kl": 0.09613037109375,
"learning_rate": 4.5e-06,
"loss": -0.0535,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.031951624900102615,
"mask/share_reasoning": 0.8504153490066528,
"mask/share_step_conf": 0.11763307452201843,
"num_tokens": 9162536.0,
"reward": 0.6530201435089111,
"reward_std": 0.2201344519853592,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.5270024538040161,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7790378332138062,
"step": 38
},
{
"adv/mean_abs_final_conf": 0.7539465427398682,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.761415958404541,
"adv/std_final_conf": 0.9165918231010437,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355933666229248,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5276138003410731,
"calib/avg_num_step_conf": 5.8046875,
"calib/ece": 0.45404453441295556,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.8421052631578947,
"calib/gap": 0.03396035681490217,
"calib/mean_conf": 0.9127570850202429,
"calib/mu_c": 0.930080991735537,
"calib/mu_w": 0.8961206349206349,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.43846153846153857,
"calib/std_conf": 0.197062096967272,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.522846,
"calib/step_q_c_n": 600.0,
"calib/step_q_gap": 0.054166541760722386,
"calib/step_q_w": 0.46867945823927765,
"calib/step_q_w_n": 886.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2590.0,
"completions/max_terminated_length": 2590.0,
"completions/mean_length": 561.53125,
"completions/mean_terminated_length": 568.1897583007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.0416,
"grad_norm": 0.031767114996910095,
"kl": 0.0828094482421875,
"learning_rate": 4.472222222222223e-06,
"loss": -0.0377,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.030686449259519577,
"mask/share_reasoning": 0.842280924320221,
"mask/share_step_conf": 0.11531387269496918,
"num_tokens": 9412376.0,
"reward": 0.6522339582443237,
"reward_std": 0.20904475450515747,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.5222196578979492,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7822482585906982,
"step": 39
},
{
"adv/mean_abs_final_conf": 0.7848721742630005,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7495676279067993,
"adv/std_final_conf": 0.9360496997833252,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935839056968689,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5932433305342032,
"calib/avg_num_step_conf": 5.140625,
"calib/ece": 0.4544000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.78,
"calib/gap": 0.06007880627866424,
"calib/mean_conf": 0.89336,
"calib/mu_c": 0.926283185840708,
"calib/mu_w": 0.8662043795620438,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.44788000000000006,
"calib/std_conf": 0.2023756665214472,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5128762430939227,
"calib/step_q_c_n": 543.0,
"calib/step_q_gap": 0.039638080092629036,
"calib/step_q_w": 0.4732381630012936,
"calib/step_q_w_n": 773.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2302.0,
"completions/max_terminated_length": 2302.0,
"completions/mean_length": 548.6015625,
"completions/mean_terminated_length": 550.7529907226562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.030587203800678253,
"kl": 0.096893310546875,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0833,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.030954282730817795,
"mask/share_reasoning": 0.8567135334014893,
"mask/share_step_conf": 0.10842593014240265,
"num_tokens": 9659578.0,
"reward": 0.6597231030464172,
"reward_std": 0.23681029677391052,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.5329078435897827,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.7865383625030518,
"step": 40
},
{
"adv/mean_abs_final_conf": 0.770974338054657,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7481893301010132,
"adv/std_final_conf": 0.9226452708244324,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354609251022339,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5879452458051222,
"calib/avg_num_step_conf": 5.359375,
"calib/ece": 0.2507525896414342,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6812749003984063,
"calib/gap": 0.08494085222254932,
"calib/mean_conf": 0.8374545816733069,
"calib/mu_c": 0.8641889534883721,
"calib/mu_w": 0.7792481012658228,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.20147410358565732,
"calib/std_conf": 0.2666757460938386,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.491091425389755,
"calib/step_q_c_n": 898.0,
"calib/step_q_gap": 0.041407881085957554,
"calib/step_q_w": 0.4496835443037974,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2399.0,
"completions/max_terminated_length": 2399.0,
"completions/mean_length": 494.796875,
"completions/mean_terminated_length": 496.7372741699219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.02226729691028595,
"kl": 0.100555419921875,
"learning_rate": 4.416666666666667e-06,
"loss": -0.048,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.033622778952121735,
"mask/share_reasoning": 0.8409379720687866,
"mask/share_step_conf": 0.12153302878141403,
"num_tokens": 9893494.0,
"reward": 0.7606412172317505,
"reward_std": 0.19762994349002838,
"rewards/accuracy_reward_step": 0.671875,
"rewards/final_brier_reward_step": 0.7124884128570557,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.8087941408157349,
"step": 41
},
{
"adv/mean_abs_final_conf": 0.7752968072891235,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.755486249923706,
"adv/std_final_conf": 0.9295274019241333,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355602264404297,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.5258946572580645,
"calib/avg_num_step_conf": 5.93359375,
"calib/ece": 0.38066547619047625,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.98046875,
"calib/frac_conf_gt_0.9": 0.6309523809523809,
"calib/gap": 0.04844412802419351,
"calib/mean_conf": 0.8299694444444443,
"calib/mu_c": 0.85380703125,
"calib/mu_w": 0.8053629032258065,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 1.0,
"calib/pce": 0.3513492063492064,
"calib/std_conf": 0.25918747776435896,
"calib/step_conf_rate": 1.0,
"calib/step_q_c": 0.4970636734693878,
"calib/step_q_c_n": 735.0,
"calib/step_q_gap": -0.0015517602040815426,
"calib/step_q_w": 0.49861543367346933,
"calib/step_q_w_n": 784.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2375.0,
"completions/max_terminated_length": 2375.0,
"completions/mean_length": 471.75,
"completions/mean_terminated_length": 471.75,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.0448,
"grad_norm": 0.0334896519780159,
"kl": 0.1055450439453125,
"learning_rate": 4.388888888888889e-06,
"loss": 0.018,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03502010926604271,
"mask/share_reasoning": 0.8282299041748047,
"mask/share_step_conf": 0.1367499828338623,
"num_tokens": 10118630.0,
"reward": 0.6918591856956482,
"reward_std": 0.21222494542598724,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.5937366485595703,
"rewards/format_reward_step": 0.98046875,
"rewards/step_l2_reward": 0.7899817824363708,
"step": 42
},
{
"adv/mean_abs_final_conf": 0.8061068058013916,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7791420817375183,
"adv/std_final_conf": 0.9365590214729309,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353040456771851,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5696798493408664,
"calib/avg_num_step_conf": 5.34765625,
"calib/ece": 0.32089328063241107,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.98828125,
"calib/frac_conf_gt_0.9": 0.5019762845849802,
"calib/gap": 0.07902419962335205,
"calib/mean_conf": 0.744798418972332,
"calib/mu_c": 0.7816555555555554,
"calib/mu_w": 0.7026313559322034,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.2660474308300395,
"calib/std_conf": 0.31904407131952894,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.4950938858695652,
"calib/step_q_c_n": 736.0,
"calib/step_q_gap": 0.012943648902740579,
"calib/step_q_w": 0.48215023696682463,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2073.0,
"completions/max_terminated_length": 2073.0,
"completions/mean_length": 536.25,
"completions/mean_terminated_length": 536.25,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.028792526572942734,
"kl": 0.0963134765625,
"learning_rate": 4.361111111111112e-06,
"loss": -0.0078,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03177189826965332,
"mask/share_reasoning": 0.851236879825592,
"mask/share_step_conf": 0.11699119210243225,
"num_tokens": 10361134.0,
"reward": 0.715941309928894,
"reward_std": 0.21710166335105896,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.6365195512771606,
"rewards/format_reward_step": 0.98828125,
"rewards/step_l2_reward": 0.7953630685806274,
"step": 43
},
{
"adv/mean_abs_final_conf": 0.8082307577133179,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7397419214248657,
"adv/std_final_conf": 0.9357187151908875,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356873631477356,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.5607762017336485,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.3218108433734941,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.41767068273092367,
"calib/gap": 0.09593624901497244,
"calib/mean_conf": 0.6958598393574298,
"calib/mu_c": 0.7501851851851852,
"calib/mu_w": 0.6542489361702127,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.29196787148594383,
"calib/std_conf": 0.3225989883784285,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4841393264840182,
"calib/step_q_c_n": 584.0,
"calib/step_q_gap": 0.02785838453440148,
"calib/step_q_w": 0.4562809419496167,
"calib/step_q_w_n": 913.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2823.0,
"completions/max_terminated_length": 2823.0,
"completions/mean_length": 581.10546875,
"completions/mean_terminated_length": 583.3843383789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.03097447007894516,
"kl": 0.0913848876953125,
"learning_rate": 4.333333333333334e-06,
"loss": -0.0729,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.0295084398239851,
"mask/share_reasoning": 0.8522259593009949,
"mask/share_step_conf": 0.11435935646295547,
"num_tokens": 10616217.0,
"reward": 0.6976090669631958,
"reward_std": 0.22840039432048798,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.6077798008918762,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7874383926391602,
"step": 44
},
{
"adv/mean_abs_final_conf": 0.806146502494812,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.773840606212616,
"adv/std_final_conf": 0.9366689920425415,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935440719127655,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.555874840357599,
"calib/avg_num_step_conf": 5.625,
"calib/ece": 0.28786334661354573,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.32270916334661354,
"calib/gap": 0.06492851213282225,
"calib/mean_conf": 0.606080876494024,
"calib/mu_c": 0.6410025862068964,
"calib/mu_w": 0.5760740740740742,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.21589641434262938,
"calib/std_conf": 0.33849399559315413,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.49986407766990293,
"calib/step_q_c_n": 618.0,
"calib/step_q_gap": 0.028646072803722944,
"calib/step_q_w": 0.47121800486618,
"calib/step_q_w_n": 822.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2510.0,
"completions/max_terminated_length": 2510.0,
"completions/mean_length": 525.578125,
"completions/mean_terminated_length": 527.6392211914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.048,
"grad_norm": 0.027358626946806908,
"kl": 0.101593017578125,
"learning_rate": 4.305555555555556e-06,
"loss": -0.0421,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03319074958562851,
"mask/share_reasoning": 0.8404501080513,
"mask/share_step_conf": 0.12245286256074905,
"num_tokens": 10855813.0,
"reward": 0.7107663750648499,
"reward_std": 0.21518194675445557,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.6325352787971497,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.7889974117279053,
"step": 45
},
{
"adv/mean_abs_final_conf": 0.8161130547523499,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.78526771068573,
"adv/std_final_conf": 0.9366722702980042,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356869459152222,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5504032258064516,
"calib/avg_num_step_conf": 6.5234375,
"calib/ece": 0.2506731707317073,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.22764227642276422,
"calib/gap": 0.06345403225806456,
"calib/mean_conf": 0.5604650406504067,
"calib/mu_c": 0.59245,
"calib/mu_w": 0.5289959677419355,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.15760162601626018,
"calib/std_conf": 0.3357974991030459,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.46490219941348976,
"calib/step_q_c_n": 682.0,
"calib/step_q_gap": 0.09222669334061528,
"calib/step_q_w": 0.3726755060728745,
"calib/step_q_w_n": 988.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2462.0,
"completions/max_terminated_length": 2462.0,
"completions/mean_length": 579.35546875,
"completions/mean_terminated_length": 581.6275024414062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.023970698937773705,
"kl": 0.090087890625,
"learning_rate": 4.277777777777778e-06,
"loss": -0.0945,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03279152512550354,
"mask/share_reasoning": 0.8372425436973572,
"mask/share_step_conf": 0.1260596662759781,
"num_tokens": 11108896.0,
"reward": 0.7093468904495239,
"reward_std": 0.2259354442358017,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6335732936859131,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7851204872131348,
"step": 46
},
{
"adv/mean_abs_final_conf": 0.8137375116348267,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7501428127288818,
"adv/std_final_conf": 0.9365950226783752,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354432821273804,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5766758186566664,
"calib/avg_num_step_conf": 5.625,
"calib/ece": 0.24800772357723574,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.14634146341463414,
"calib/gap": 0.08106213754771313,
"calib/mean_conf": 0.5145939024390244,
"calib/mu_c": 0.5505116788321168,
"calib/mu_w": 0.4694495412844037,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.10284552845528454,
"calib/std_conf": 0.3287003753442299,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.44728381962864716,
"calib/step_q_c_n": 754.0,
"calib/step_q_gap": 0.04672740563447808,
"calib/step_q_w": 0.4005564139941691,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 560.16796875,
"completions/mean_terminated_length": 562.36474609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.02703959122300148,
"kl": 0.1092376708984375,
"learning_rate": 4.25e-06,
"loss": -0.1382,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.030245723202824593,
"mask/share_reasoning": 0.8493208885192871,
"mask/share_step_conf": 0.11652719229459763,
"num_tokens": 11358275.0,
"reward": 0.722832441329956,
"reward_std": 0.20725570619106293,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.6541945338249207,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7914702892303467,
"step": 47
},
{
"adv/mean_abs_final_conf": 0.8032364845275879,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7610890865325928,
"adv/std_final_conf": 0.9366337656974792,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935653805732727,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.552454916414374,
"calib/avg_num_step_conf": 5.15234375,
"calib/ece": 0.243355421686747,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.09236947791164658,
"calib/gap": 0.05796554561010914,
"calib/mean_conf": 0.46853212851405623,
"calib/mu_c": 0.5015887850467289,
"calib/mu_w": 0.44362323943661974,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.14108433734939763,
"calib/std_conf": 0.3066094013429056,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4177630975143403,
"calib/step_q_c_n": 523.0,
"calib/step_q_gap": 0.03488998193645093,
"calib/step_q_w": 0.3828731155778894,
"calib/step_q_w_n": 796.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2527.0,
"completions/max_terminated_length": 2527.0,
"completions/mean_length": 504.65234375,
"completions/mean_terminated_length": 504.65234375,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.0512,
"grad_norm": 0.02584809623658657,
"kl": 0.1268463134765625,
"learning_rate": 4.222222222222223e-06,
"loss": -0.0833,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03356067091226578,
"mask/share_reasoning": 0.8469815254211426,
"mask/share_step_conf": 0.11945784091949463,
"num_tokens": 11591154.0,
"reward": 0.7318041324615479,
"reward_std": 0.21007341146469116,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.6651387214660645,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7984695434570312,
"step": 48
},
{
"adv/mean_abs_final_conf": 0.7968345284461975,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7676880955696106,
"adv/std_final_conf": 0.9365724325180054,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351038932800293,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6370830608240681,
"calib/avg_num_step_conf": 5.69921875,
"calib/ece": 0.18111686746987948,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.07630522088353414,
"calib/gap": 0.14417363636363645,
"calib/mean_conf": 0.4253088353413655,
"calib/mu_c": 0.48900000000000005,
"calib/mu_w": 0.3448263636363636,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.02409638554216871,
"calib/std_conf": 0.2932193968704206,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.40479230769230773,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.0026721875721875676,
"calib/step_q_w": 0.40212012012012016,
"calib/step_q_w_n": 666.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3065.0,
"completions/max_terminated_length": 3065.0,
"completions/mean_length": 515.71875,
"completions/mean_terminated_length": 517.7412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.030575614422559738,
"kl": 0.1236572265625,
"learning_rate": 4.194444444444445e-06,
"loss": -0.0208,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032488685101270676,
"mask/share_reasoning": 0.8429965376853943,
"mask/share_step_conf": 0.12060855329036713,
"num_tokens": 11827714.0,
"reward": 0.7448811531066895,
"reward_std": 0.1802704930305481,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.6911728978157043,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7985894083976746,
"step": 49
},
{
"adv/mean_abs_final_conf": 0.7772702574729919,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.742446780204773,
"adv/std_final_conf": 0.9365054965019226,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935427188873291,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.5517457139281692,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.2025691699604743,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.05533596837944664,
"calib/gap": 0.04930421724440004,
"calib/mean_conf": 0.42454545454545456,
"calib/mu_c": 0.44832061068702295,
"calib/mu_w": 0.3990163934426229,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.05466403162055336,
"calib/std_conf": 0.26587370123610027,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.40103092783505156,
"calib/step_q_c_n": 679.0,
"calib/step_q_gap": 0.04098436402018485,
"calib/step_q_w": 0.3600465638148667,
"calib/step_q_w_n": 713.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 522.71875,
"completions/mean_terminated_length": 522.71875,
"completions/min_length": 112.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.02779863029718399,
"kl": 0.1237640380859375,
"learning_rate": 4.166666666666667e-06,
"loss": -0.1123,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03340587019920349,
"mask/share_reasoning": 0.8507226705551147,
"mask/share_step_conf": 0.11587147414684296,
"num_tokens": 12066890.0,
"reward": 0.7342731952667236,
"reward_std": 0.19280801713466644,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.6673511862754822,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8011952638626099,
"step": 50
},
{
"adv/mean_abs_final_conf": 0.7935712337493896,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7670265436172485,
"adv/std_final_conf": 0.9364770650863647,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352994561195374,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6248664886515354,
"calib/avg_num_step_conf": 4.90625,
"calib/ece": 0.1584615384615385,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.044534412955465584,
"calib/gap": 0.11627236315086775,
"calib/mean_conf": 0.4502024291497976,
"calib/mu_c": 0.5005714285714286,
"calib/mu_w": 0.3842990654205608,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.020931174089068832,
"calib/std_conf": 0.2637451373655593,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.40795526315789477,
"calib/step_q_c_n": 684.0,
"calib/step_q_gap": 0.030193025395657047,
"calib/step_q_w": 0.3777622377622377,
"calib/step_q_w_n": 572.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2014.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 503.4765625,
"completions/mean_terminated_length": 509.4466552734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.0544,
"grad_norm": 0.03294004499912262,
"kl": 0.1114349365234375,
"learning_rate": 4.138888888888889e-06,
"loss": -0.1588,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03207635134458542,
"mask/share_reasoning": 0.8514719605445862,
"mask/share_step_conf": 0.104732945561409,
"num_tokens": 12305076.0,
"reward": 0.7408157587051392,
"reward_std": 0.18963393568992615,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6903417706489563,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7912896871566772,
"step": 51
},
{
"adv/mean_abs_final_conf": 0.7684762477874756,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7616976499557495,
"adv/std_final_conf": 0.9363729357719421,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348715543746948,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.6600189933523266,
"calib/avg_num_step_conf": 4.9296875,
"calib/ece": 0.21298418972332014,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.04743083003952569,
"calib/gap": 0.12831908831908823,
"calib/mean_conf": 0.4525494071146246,
"calib/mu_c": 0.49870370370370365,
"calib/mu_w": 0.3703846153846154,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.01260869565217391,
"calib/std_conf": 0.24637194023352396,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.43677819083023545,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.08304192709397173,
"calib/step_q_w": 0.3537362637362637,
"calib/step_q_w_n": 455.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1894.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 487.19921875,
"completions/mean_terminated_length": 489.1098327636719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.036368850618600845,
"kl": 0.1165313720703125,
"learning_rate": 4.111111111111111e-06,
"loss": -0.0632,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.034070298075675964,
"mask/share_reasoning": 0.850233256816864,
"mask/share_step_conf": 0.11179022490978241,
"num_tokens": 12537751.0,
"reward": 0.7687431573867798,
"reward_std": 0.1700069010257721,
"rewards/accuracy_reward_step": 0.63671875,
"rewards/final_brier_reward_step": 0.7088756561279297,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8286106586456299,
"step": 52
},
{
"adv/mean_abs_final_conf": 0.7876460552215576,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7616404294967651,
"adv/std_final_conf": 0.9364662766456604,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351630806922913,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.5511803011803011,
"calib/avg_num_step_conf": 5.45703125,
"calib/ece": 0.19147470355731222,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.07905138339920949,
"calib/gap": 0.03547876136209471,
"calib/mean_conf": 0.5397505928853755,
"calib/mu_c": 0.5525117283950617,
"calib/mu_w": 0.517032967032967,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.045454545454545456,
"calib/std_conf": 0.2577434600437044,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.45877871825876665,
"calib/step_q_c_n": 827.0,
"calib/step_q_gap": 0.06298222703069645,
"calib/step_q_w": 0.3957964912280702,
"calib/step_q_w_n": 570.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2504.0,
"completions/max_terminated_length": 2504.0,
"completions/mean_length": 536.9609375,
"completions/mean_terminated_length": 541.18896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.04446394741535187,
"kl": 0.101318359375,
"learning_rate": 4.083333333333334e-06,
"loss": -0.114,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.030670396983623505,
"mask/share_reasoning": 0.8516263961791992,
"mask/share_step_conf": 0.10989070683717728,
"num_tokens": 12781037.0,
"reward": 0.7480237483978271,
"reward_std": 0.17704400420188904,
"rewards/accuracy_reward_step": 0.6328125,
"rewards/final_brier_reward_step": 0.6884797215461731,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8075677752494812,
"step": 53
},
{
"adv/mean_abs_final_conf": 0.7647438049316406,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7672848701477051,
"adv/std_final_conf": 0.9361473917961121,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348371624946594,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.646909903201787,
"calib/avg_num_step_conf": 4.93359375,
"calib/ece": 0.11072289156626511,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.10040160642570281,
"calib/gap": 0.11389873417721519,
"calib/mean_conf": 0.6158634538152611,
"calib/mu_c": 0.652,
"calib/mu_w": 0.5381012658227848,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.021927710843373513,
"calib/std_conf": 0.22712662425064317,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4767085427135679,
"calib/step_q_c_n": 796.0,
"calib/step_q_gap": 0.05126957055082698,
"calib/step_q_w": 0.4254389721627409,
"calib/step_q_w_n": 467.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2736.0,
"completions/max_terminated_length": 2736.0,
"completions/mean_length": 480.26953125,
"completions/mean_terminated_length": 482.1529846191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.0576,
"grad_norm": 0.04478487744927406,
"kl": 0.1110992431640625,
"learning_rate": 4.055555555555556e-06,
"loss": -0.1372,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03677823022007942,
"mask/share_reasoning": 0.8410859107971191,
"mask/share_step_conf": 0.11822962015867233,
"num_tokens": 13010218.0,
"reward": 0.7791956663131714,
"reward_std": 0.15325689315795898,
"rewards/accuracy_reward_step": 0.6640625,
"rewards/final_brier_reward_step": 0.7515324354171753,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8068588972091675,
"step": 54
},
{
"adv/mean_abs_final_conf": 0.8050023317337036,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7768208980560303,
"adv/std_final_conf": 0.9359601140022278,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349936246871948,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6821805006587616,
"calib/avg_num_step_conf": 4.4765625,
"calib/ece": 0.17230769230769225,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.12550607287449392,
"calib/gap": 0.1376495388669302,
"calib/mean_conf": 0.6965182186234817,
"calib/mu_c": 0.7606060606060606,
"calib/mu_w": 0.6229565217391304,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.16720647773279343,
"calib/std_conf": 0.21997244645283676,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5090816326530613,
"calib/step_q_c_n": 588.0,
"calib/step_q_gap": 0.048275181040158044,
"calib/step_q_w": 0.46080645161290323,
"calib/step_q_w_n": 558.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2606.0,
"completions/max_terminated_length": 2606.0,
"completions/mean_length": 494.07421875,
"completions/mean_terminated_length": 496.01177978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.02491808868944645,
"kl": 0.1061553955078125,
"learning_rate": 4.027777777777779e-06,
"loss": -0.0768,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.033575639128685,
"mask/share_reasoning": 0.8579612970352173,
"mask/share_step_conf": 0.10455679893493652,
"num_tokens": 13244525.0,
"reward": 0.7591003775596619,
"reward_std": 0.17801684141159058,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7125382423400879,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8056625127792358,
"step": 55
},
{
"adv/mean_abs_final_conf": 0.7873808145523071,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7654106020927429,
"adv/std_final_conf": 0.9311465620994568,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346365928649902,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6846971955019645,
"calib/avg_num_step_conf": 5.265625,
"calib/ece": 0.29641975308641977,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.32510288065843623,
"calib/gap": 0.10250237095244541,
"calib/mean_conf": 0.7961728395061728,
"calib/mu_c": 0.8472131147540983,
"calib/mu_w": 0.7447107438016529,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.2952674897119342,
"calib/std_conf": 0.17294282626148935,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.538655589123867,
"calib/step_q_c_n": 662.0,
"calib/step_q_gap": 0.07869203227255511,
"calib/step_q_w": 0.4599635568513119,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2482.0,
"completions/max_terminated_length": 2482.0,
"completions/mean_length": 525.75390625,
"completions/mean_terminated_length": 525.75390625,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.02171236462891102,
"kl": 0.0927276611328125,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0945,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03197760134935379,
"mask/share_reasoning": 0.8525564670562744,
"mask/share_step_conf": 0.11546589434146881,
"num_tokens": 13485958.0,
"reward": 0.7075331211090088,
"reward_std": 0.17797213792800903,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.6334031224250793,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.7816630601882935,
"step": 56
},
{
"adv/mean_abs_final_conf": 0.7610681653022766,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7601795196533203,
"adv/std_final_conf": 0.9341049194335938,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350772500038147,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6245230662504336,
"calib/avg_num_step_conf": 5.09375,
"calib/ece": 0.2414516129032258,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.4717741935483871,
"calib/gap": 0.058666666666666534,
"calib/mean_conf": 0.8558064516129031,
"calib/mu_c": 0.8778064516129033,
"calib/mu_w": 0.8191397849462367,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.23612903225806453,
"calib/std_conf": 0.1289886014856173,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5579113924050633,
"calib/step_q_c_n": 790.0,
"calib/step_q_gap": 0.028339407969265595,
"calib/step_q_w": 0.5295719844357977,
"calib/step_q_w_n": 514.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2459.0,
"completions/max_terminated_length": 2459.0,
"completions/mean_length": 511.91796875,
"completions/mean_terminated_length": 513.925537109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.0608,
"grad_norm": 0.021534863859415054,
"kl": 0.09047698974609375,
"learning_rate": 3.972222222222223e-06,
"loss": -0.1495,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.033345114439725876,
"mask/share_reasoning": 0.8466142416000366,
"mask/share_step_conf": 0.11613436043262482,
"num_tokens": 13723801.0,
"reward": 0.729364812374115,
"reward_std": 0.2116679847240448,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.674622654914856,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.784106969833374,
"step": 57
},
{
"adv/mean_abs_final_conf": 0.7951971292495728,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7578577995300293,
"adv/std_final_conf": 0.9347510933876038,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353342652320862,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.5490826302397214,
"calib/avg_num_step_conf": 5.57421875,
"calib/ece": 0.37469387755102046,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.46122448979591835,
"calib/gap": 0.03215481451720903,
"calib/mean_conf": 0.8400000000000001,
"calib/mu_c": 0.8571929824561404,
"calib/mu_w": 0.8250381679389314,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.37469387755102046,
"calib/std_conf": 0.14832396974191325,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.5785714285714286,
"calib/step_q_c_n": 609.0,
"calib/step_q_gap": 0.04798463150541399,
"calib/step_q_w": 0.5305867970660146,
"calib/step_q_w_n": 818.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 592.01953125,
"completions/mean_terminated_length": 594.3411865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.02118760719895363,
"kl": 0.0829010009765625,
"learning_rate": 3.944444444444445e-06,
"loss": -0.2503,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.02853698655962944,
"mask/share_reasoning": 0.8620185852050781,
"mask/share_step_conf": 0.10553818941116333,
"num_tokens": 13981678.0,
"reward": 0.653290867805481,
"reward_std": 0.22565579414367676,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.5532636642456055,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7533180713653564,
"step": 58
},
{
"adv/mean_abs_final_conf": 0.7879300117492676,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7936879396438599,
"adv/std_final_conf": 0.9314253926277161,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934874951839447,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.590799339179057,
"calib/avg_num_step_conf": 5.6640625,
"calib/ece": 0.37725099601593626,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5776892430278885,
"calib/gap": 0.03818528402592447,
"calib/mean_conf": 0.8830677290836653,
"calib/mu_c": 0.9016279069767442,
"calib/mu_w": 0.8634426229508197,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.3731872509960159,
"calib/std_conf": 0.11880778692277857,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6050219619326501,
"calib/step_q_c_n": 683.0,
"calib/step_q_gap": 0.04757737262365391,
"calib/step_q_w": 0.5574445893089962,
"calib/step_q_w_n": 767.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 545.60546875,
"completions/mean_terminated_length": 545.60546875,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.025029340758919716,
"kl": 0.093658447265625,
"learning_rate": 3.916666666666667e-06,
"loss": -0.071,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0332186222076416,
"mask/share_reasoning": 0.8511619567871094,
"mask/share_step_conf": 0.11561942845582962,
"num_tokens": 14227601.0,
"reward": 0.6718493103981018,
"reward_std": 0.2027919441461563,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.5778699517250061,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.7658286094665527,
"step": 59
},
{
"adv/mean_abs_final_conf": 0.7543654441833496,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7350926399230957,
"adv/std_final_conf": 0.931016206741333,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350695610046387,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.6637228260869565,
"calib/avg_num_step_conf": 5.453125,
"calib/ece": 0.3563374485596708,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.5432098765432098,
"calib/gap": 0.0603362771739131,
"calib/mean_conf": 0.8830864197530863,
"calib/mu_c": 0.911640625,
"calib/mu_w": 0.8513043478260869,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3563374485596708,
"calib/std_conf": 0.10384352713739116,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.6065560821484992,
"calib/step_q_c_n": 633.0,
"calib/step_q_gap": 0.03696237310524886,
"calib/step_q_w": 0.5695937090432504,
"calib/step_q_w_n": 763.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2703.0,
"completions/max_terminated_length": 2703.0,
"completions/mean_length": 528.80859375,
"completions/mean_terminated_length": 532.972412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.064,
"grad_norm": 0.026383792981505394,
"kl": 0.08768463134765625,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0774,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.031654588878154755,
"mask/share_reasoning": 0.8491592407226562,
"mask/share_step_conf": 0.11137367784976959,
"num_tokens": 14471832.0,
"reward": 0.6921672821044922,
"reward_std": 0.22028900682926178,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6036254167556763,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7807092070579529,
"step": 60
},
{
"adv/mean_abs_final_conf": 0.7289884090423584,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7417490482330322,
"adv/std_final_conf": 0.9285774230957031,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348416924476624,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6275510204081632,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.3030677290836653,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6334661354581673,
"calib/gap": 0.027582366279845116,
"calib/mean_conf": 0.9126294820717131,
"calib/mu_c": 0.9233986928104574,
"calib/mu_w": 0.8958163265306123,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.3030677290836653,
"calib/std_conf": 0.06633443403523974,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6239313984168865,
"calib/step_q_c_n": 758.0,
"calib/step_q_gap": 0.07113139841688643,
"calib/step_q_w": 0.5528000000000001,
"calib/step_q_w_n": 550.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2027.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 458.60546875,
"completions/mean_terminated_length": 458.60546875,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.019766852259635925,
"kl": 0.0915374755859375,
"learning_rate": 3.861111111111112e-06,
"loss": -0.036,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.036816179752349854,
"mask/share_reasoning": 0.8392590284347534,
"mask/share_step_conf": 0.12392475455999374,
"num_tokens": 14693299.0,
"reward": 0.7142760157585144,
"reward_std": 0.18115657567977905,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6492277383804321,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7793242931365967,
"step": 61
},
{
"adv/mean_abs_final_conf": 0.8142984509468079,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.768386960029602,
"adv/std_final_conf": 0.9315682053565979,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351645112037659,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6190079999999999,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.3777199999999999,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.528,
"calib/gap": 0.04552000000000023,
"calib/mean_conf": 0.87452,
"calib/mu_c": 0.8972800000000002,
"calib/mu_w": 0.85176,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.3761199999999999,
"calib/std_conf": 0.12240249017074774,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5904244482173174,
"calib/step_q_c_n": 589.0,
"calib/step_q_gap": 0.04210850186858428,
"calib/step_q_w": 0.5483159463487332,
"calib/step_q_w_n": 671.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2531.0,
"completions/max_terminated_length": 2531.0,
"completions/mean_length": 520.140625,
"completions/mean_terminated_length": 520.140625,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.027727412059903145,
"kl": 0.091064453125,
"learning_rate": 3.833333333333334e-06,
"loss": -0.1161,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.033058080822229385,
"mask/share_reasoning": 0.858847975730896,
"mask/share_step_conf": 0.10809390246868134,
"num_tokens": 14933535.0,
"reward": 0.6698973178863525,
"reward_std": 0.23705357313156128,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.5798171758651733,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.7599775791168213,
"step": 62
},
{
"adv/mean_abs_final_conf": 0.7539543509483337,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7746871709823608,
"adv/std_final_conf": 0.9303356409072876,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353242516517639,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7003731343283583,
"calib/avg_num_step_conf": 4.44140625,
"calib/ece": 0.3334426229508196,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.8515625,
"calib/frac_conf_gt_0.9": 0.5286885245901639,
"calib/gap": 0.07777747625508813,
"calib/mean_conf": 0.8826229508196721,
"calib/mu_c": 0.9176865671641791,
"calib/mu_w": 0.8399090909090909,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.875,
"calib/pce": 0.3334426229508196,
"calib/std_conf": 0.10850552058697308,
"calib/step_conf_rate": 0.875,
"calib/step_q_c": 0.5954471544715446,
"calib/step_q_c_n": 615.0,
"calib/step_q_gap": 0.04335903186618062,
"calib/step_q_w": 0.552088122605364,
"calib/step_q_w_n": 522.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2700.0,
"completions/max_terminated_length": 2700.0,
"completions/mean_length": 572.15234375,
"completions/mean_terminated_length": 572.15234375,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.0672,
"grad_norm": 0.022924363613128662,
"kl": 0.08589935302734375,
"learning_rate": 3.8055555555555556e-06,
"loss": -0.1857,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.030279844999313354,
"mask/share_reasoning": 0.876987874507904,
"mask/share_step_conf": 0.09273228049278259,
"num_tokens": 15188646.0,
"reward": 0.6390208601951599,
"reward_std": 0.2594981789588928,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.570832371711731,
"rewards/format_reward_step": 0.8515625,
"rewards/step_l2_reward": 0.7072093486785889,
"step": 63
},
{
"adv/mean_abs_final_conf": 0.7875009775161743,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7415667772293091,
"adv/std_final_conf": 0.9346509575843811,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355990886688232,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6633228840125391,
"calib/avg_num_step_conf": 4.72265625,
"calib/ece": 0.29401639344262287,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9140625,
"calib/frac_conf_gt_0.9": 0.5655737704918032,
"calib/gap": 0.05561267850923057,
"calib/mean_conf": 0.8855737704918034,
"calib/mu_c": 0.9081379310344828,
"calib/mu_w": 0.8525252525252522,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.2926639344262295,
"calib/std_conf": 0.10430919658563556,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5758356940509914,
"calib/step_q_c_n": 706.0,
"calib/step_q_gap": 0.03462297039293971,
"calib/step_q_w": 0.5412127236580517,
"calib/step_q_w_n": 503.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 528.9765625,
"completions/mean_terminated_length": 533.1417236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.024568337947130203,
"kl": 0.092010498046875,
"learning_rate": 3.777777777777778e-06,
"loss": -0.2132,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03288152813911438,
"mask/share_reasoning": 0.8555514812469482,
"mask/share_step_conf": 0.10375450551509857,
"num_tokens": 15427840.0,
"reward": 0.6932743787765503,
"reward_std": 0.25233006477355957,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6326503753662109,
"rewards/format_reward_step": 0.9140625,
"rewards/step_l2_reward": 0.7538983821868896,
"step": 64
},
{
"adv/mean_abs_final_conf": 0.7839001417160034,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7552789449691772,
"adv/std_final_conf": 0.9260660409927368,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350070357322693,
"calib/answer_extract_rate": 0.99609375,
"calib/auroc": 0.6429189322787939,
"calib/avg_num_step_conf": 4.31640625,
"calib/ece": 0.37121568627450974,
"calib/final_conf_rate": 0.99609375,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.6313725490196078,
"calib/gap": 0.050913865546218795,
"calib/mean_conf": 0.9045490196078432,
"calib/mu_c": 0.9283088235294118,
"calib/mu_w": 0.877394957983193,
"calib/nonempty_final_conf_rate": 0.99609375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.37121568627450974,
"calib/std_conf": 0.09942428663836936,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.6139208633093525,
"calib/step_q_c_n": 556.0,
"calib/step_q_gap": 0.05392086330935253,
"calib/step_q_w": 0.5599999999999999,
"calib/step_q_w_n": 549.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 994.0,
"completions/max_terminated_length": 994.0,
"completions/mean_length": 424.796875,
"completions/mean_terminated_length": 426.4627685546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.028239957988262177,
"kl": 0.101104736328125,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.1308,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03776917979121208,
"mask/share_reasoning": 0.8475953340530396,
"mask/share_step_conf": 0.11072924733161926,
"num_tokens": 15641612.0,
"reward": 0.6717511415481567,
"reward_std": 0.20962993800640106,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5822335481643677,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.7612687945365906,
"step": 65
},
{
"adv/mean_abs_final_conf": 0.762721598148346,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7631368637084961,
"adv/std_final_conf": 0.9323623180389404,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354203939437866,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.709381913716814,
"calib/avg_num_step_conf": 4.87890625,
"calib/ece": 0.3873029045643154,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.48132780082987553,
"calib/gap": 0.0859990320796461,
"calib/mean_conf": 0.8561825726141079,
"calib/mu_c": 0.901858407079646,
"calib/mu_w": 0.8158593749999999,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9453125,
"calib/pce": 0.3873029045643154,
"calib/std_conf": 0.13314715857788725,
"calib/step_conf_rate": 0.9453125,
"calib/step_q_c": 0.5657032755298652,
"calib/step_q_c_n": 519.0,
"calib/step_q_gap": 0.08078546731068714,
"calib/step_q_w": 0.48491780821917807,
"calib/step_q_w_n": 730.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2623.0,
"completions/max_terminated_length": 2623.0,
"completions/mean_length": 581.00390625,
"completions/mean_terminated_length": 583.2824096679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.0704,
"grad_norm": 0.027212418615818024,
"kl": 0.08538055419921875,
"learning_rate": 3.7222222222222225e-06,
"loss": -0.158,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.031282857060432434,
"mask/share_reasoning": 0.8646191358566284,
"mask/share_step_conf": 0.10019171983003616,
"num_tokens": 15896701.0,
"reward": 0.6610898971557617,
"reward_std": 0.23560646176338196,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.5697222948074341,
"rewards/format_reward_step": 0.90234375,
"rewards/step_l2_reward": 0.7524575591087341,
"step": 66
},
{
"adv/mean_abs_final_conf": 0.7664501667022705,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7277902960777283,
"adv/std_final_conf": 0.9249961972236633,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350871443748474,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7230132450331126,
"calib/avg_num_step_conf": 4.37890625,
"calib/ece": 0.27430278884462156,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5537848605577689,
"calib/gap": 0.09163708609271526,
"calib/mean_conf": 0.8679282868525896,
"calib/mu_c": 0.9044370860927152,
"calib/mu_w": 0.8128,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.27031872509960164,
"calib/std_conf": 0.13576289838663588,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5753651685393258,
"calib/step_q_c_n": 712.0,
"calib/step_q_gap": 0.048323603747149835,
"calib/step_q_w": 0.527041564792176,
"calib/step_q_w_n": 409.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2327.0,
"completions/max_terminated_length": 2327.0,
"completions/mean_length": 519.05078125,
"completions/mean_terminated_length": 519.05078125,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.03001784160733223,
"kl": 0.09526824951171875,
"learning_rate": 3.694444444444445e-06,
"loss": -0.1272,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0325121246278286,
"mask/share_reasoning": 0.8712092638015747,
"mask/share_step_conf": 0.09627857059240341,
"num_tokens": 16134586.0,
"reward": 0.726654589176178,
"reward_std": 0.22766532003879547,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.6616469025611877,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.791662335395813,
"step": 67
},
{
"adv/mean_abs_final_conf": 0.7865017056465149,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7429081201553345,
"adv/std_final_conf": 0.9320579171180725,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935204803943634,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7609587199139438,
"calib/avg_num_step_conf": 4.5546875,
"calib/ece": 0.4028979591836734,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.49795918367346936,
"calib/gap": 0.11741898615032953,
"calib/mean_conf": 0.8559591836734693,
"calib/mu_c": 0.9201801801801803,
"calib/mu_w": 0.8027611940298508,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.4028979591836734,
"calib/std_conf": 0.13943471286001208,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5615936254980078,
"calib/step_q_c_n": 502.0,
"calib/step_q_gap": 0.09680446887150179,
"calib/step_q_w": 0.46478915662650605,
"calib/step_q_w_n": 664.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2482.0,
"completions/max_terminated_length": 2482.0,
"completions/mean_length": 507.08984375,
"completions/mean_terminated_length": 507.08984375,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.02626444399356842,
"kl": 0.10231781005859375,
"learning_rate": 3.6666666666666666e-06,
"loss": -0.2239,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03458181768655777,
"mask/share_reasoning": 0.8608101010322571,
"mask/share_step_conf": 0.10460809618234634,
"num_tokens": 16368489.0,
"reward": 0.6846904158592224,
"reward_std": 0.21365387737751007,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.586223840713501,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7831569910049438,
"step": 68
},
{
"adv/mean_abs_final_conf": 0.7961992025375366,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7524364590644836,
"adv/std_final_conf": 0.9349520206451416,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355664253234863,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6321779795062237,
"calib/avg_num_step_conf": 4.59765625,
"calib/ece": 0.36954545454545445,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.875,
"calib/frac_conf_gt_0.9": 0.32644628099173556,
"calib/gap": 0.06065951447630813,
"calib/mean_conf": 0.8169834710743801,
"calib/mu_c": 0.8498198198198196,
"calib/mu_w": 0.7891603053435114,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.36392561983471067,
"calib/std_conf": 0.1447382638283699,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_c": 0.5257009345794392,
"calib/step_q_c_n": 535.0,
"calib/step_q_gap": 0.0514953271028038,
"calib/step_q_w": 0.4742056074766354,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2717.0,
"completions/max_terminated_length": 2717.0,
"completions/mean_length": 615.87890625,
"completions/mean_terminated_length": 615.87890625,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.0736,
"grad_norm": 0.031040944159030914,
"kl": 0.08621978759765625,
"learning_rate": 3.638888888888889e-06,
"loss": -0.2658,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.029404189437627792,
"mask/share_reasoning": 0.8814509510993958,
"mask/share_step_conf": 0.08914485573768616,
"num_tokens": 16630650.0,
"reward": 0.6388742923736572,
"reward_std": 0.2512214183807373,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.5482457280158997,
"rewards/format_reward_step": 0.875,
"rewards/step_l2_reward": 0.72950279712677,
"step": 69
},
{
"adv/mean_abs_final_conf": 0.7597904801368713,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7445217370986938,
"adv/std_final_conf": 0.9348887801170349,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351232647895813,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7801652892561984,
"calib/avg_num_step_conf": 4.578125,
"calib/ece": 0.29930894308943085,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.35365853658536583,
"calib/gap": 0.16780628099173567,
"calib/mean_conf": 0.7911788617886178,
"calib/mu_c": 0.8764462809917355,
"calib/mu_w": 0.7086399999999998,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.29930894308943085,
"calib/std_conf": 0.1871171769965541,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.5376904676258993,
"calib/step_q_c_n": 556.0,
"calib/step_q_gap": 0.07898429879473051,
"calib/step_q_w": 0.4587061688311688,
"calib/step_q_w_n": 616.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2429.0,
"completions/max_terminated_length": 2429.0,
"completions/mean_length": 543.51171875,
"completions/mean_terminated_length": 543.51171875,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.028346510604023933,
"kl": 0.092010498046875,
"learning_rate": 3.6111111111111115e-06,
"loss": -0.2265,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03363034129142761,
"mask/share_reasoning": 0.8627834320068359,
"mask/share_step_conf": 0.10358622670173645,
"num_tokens": 16876781.0,
"reward": 0.7075166702270508,
"reward_std": 0.23655042052268982,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.646140992641449,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l2_reward": 0.7688924074172974,
"step": 70
},
{
"adv/mean_abs_final_conf": 0.7985207438468933,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7744780778884888,
"adv/std_final_conf": 0.9292157292366028,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356667995452881,
"calib/answer_extract_rate": 0.921875,
"calib/auroc": 0.6687994248741913,
"calib/avg_num_step_conf": 4.765625,
"calib/ece": 0.25822784810126587,
"calib/final_conf_rate": 0.92578125,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.3333333333333333,
"calib/gap": 0.0902508986340761,
"calib/mean_conf": 0.7828691983122363,
"calib/mu_c": 0.8236153846153845,
"calib/mu_w": 0.7333644859813084,
"calib/nonempty_final_conf_rate": 0.92578125,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.24628691983122367,
"calib/std_conf": 0.19327347696814814,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_c": 0.4937739130434783,
"calib/step_q_c_n": 575.0,
"calib/step_q_gap": 0.04875685877991248,
"calib/step_q_w": 0.4450170542635658,
"calib/step_q_w_n": 645.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2772.0,
"completions/max_terminated_length": 2772.0,
"completions/mean_length": 542.9296875,
"completions/mean_terminated_length": 547.2047119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.026825150474905968,
"kl": 0.08586883544921875,
"learning_rate": 3.5833333333333335e-06,
"loss": -0.2883,
"mask/has_final_conf_rate": 0.92578125,
"mask/share_final_conf": 0.03389473259449005,
"mask/share_reasoning": 0.8552953600883484,
"mask/share_step_conf": 0.10299739241600037,
"num_tokens": 17120179.0,
"reward": 0.6880367994308472,
"reward_std": 0.23565784096717834,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6271457076072693,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l2_reward": 0.748927891254425,
"step": 71
},
{
"adv/mean_abs_final_conf": 0.7596650123596191,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7277320623397827,
"adv/std_final_conf": 0.9333326816558838,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347254037857056,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7557932263814617,
"calib/avg_num_step_conf": 4.77734375,
"calib/ece": 0.2757370517928288,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.32270916334661354,
"calib/gap": 0.13253692386045335,
"calib/mean_conf": 0.8016334661354582,
"calib/mu_c": 0.864469696969697,
"calib/mu_w": 0.7319327731092437,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.2757370517928288,
"calib/std_conf": 0.15685695616309417,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.5198018292682927,
"calib/step_q_c_n": 656.0,
"calib/step_q_gap": 0.06816161762808104,
"calib/step_q_w": 0.4516402116402116,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2340.0,
"completions/max_terminated_length": 2340.0,
"completions/mean_length": 476.59375,
"completions/mean_terminated_length": 476.59375,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.0768,
"grad_norm": 0.02857062965631485,
"kl": 0.1085052490234375,
"learning_rate": 3.555555555555556e-06,
"loss": -0.1094,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.034555524587631226,
"mask/share_reasoning": 0.8587918877601624,
"mask/share_step_conf": 0.10665258765220642,
"num_tokens": 17346595.0,
"reward": 0.7392154932022095,
"reward_std": 0.20494453608989716,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.6732785105705261,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.8051523566246033,
"step": 72
},
{
"adv/mean_abs_final_conf": 0.7999264001846313,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7472314834594727,
"adv/std_final_conf": 0.9317479133605957,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346880912780762,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6718409586056645,
"calib/avg_num_step_conf": 4.32421875,
"calib/ece": 0.17265060240963856,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.321285140562249,
"calib/gap": 0.10831290849673203,
"calib/mean_conf": 0.7830120481927711,
"calib/mu_c": 0.8247712418300653,
"calib/mu_w": 0.7164583333333333,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.17060240963855422,
"calib/std_conf": 0.18381544092024615,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5335294117647059,
"calib/step_q_c_n": 663.0,
"calib/step_q_gap": 0.08039878113407528,
"calib/step_q_w": 0.45313063063063064,
"calib/step_q_w_n": 444.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2091.0,
"completions/max_terminated_length": 2091.0,
"completions/mean_length": 474.546875,
"completions/mean_terminated_length": 474.546875,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.028949644416570663,
"kl": 0.09024810791015625,
"learning_rate": 3.5277777777777784e-06,
"loss": -0.1175,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03428073972463608,
"mask/share_reasoning": 0.8634704351425171,
"mask/share_step_conf": 0.10224878787994385,
"num_tokens": 17575111.0,
"reward": 0.7635636925697327,
"reward_std": 0.18511897325515747,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7138800621032715,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8132473826408386,
"step": 73
},
{
"adv/mean_abs_final_conf": 0.7531702518463135,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7528907656669617,
"adv/std_final_conf": 0.9215183258056641,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350585341453552,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.6780112044817928,
"calib/avg_num_step_conf": 4.953125,
"calib/ece": 0.277551867219917,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.890625,
"calib/frac_conf_gt_0.9": 0.25311203319502074,
"calib/gap": 0.1256932773109246,
"calib/mean_conf": 0.7104979253112034,
"calib/mu_c": 0.7814285714285716,
"calib/mu_w": 0.655735294117647,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.27618257261410784,
"calib/std_conf": 0.22586064304938835,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.47659090909090907,
"calib/step_q_c_n": 528.0,
"calib/step_q_gap": 0.061158476658476624,
"calib/step_q_w": 0.41543243243243244,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2621.0,
"completions/max_terminated_length": 2621.0,
"completions/mean_length": 520.171875,
"completions/mean_terminated_length": 522.2117919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.04748916998505592,
"kl": 0.106292724609375,
"learning_rate": 3.5e-06,
"loss": -0.2116,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.034913722425699234,
"mask/share_reasoning": 0.8536270260810852,
"mask/share_step_conf": 0.10755302011966705,
"num_tokens": 17812203.0,
"reward": 0.6924230456352234,
"reward_std": 0.2192223221063614,
"rewards/accuracy_reward_step": 0.41015625,
"rewards/final_brier_reward_step": 0.6199167966842651,
"rewards/format_reward_step": 0.890625,
"rewards/step_l2_reward": 0.7649291753768921,
"step": 74
},
{
"adv/mean_abs_final_conf": 0.7570982575416565,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7516752481460571,
"adv/std_final_conf": 0.9306212067604065,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934241771697998,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7717879604672058,
"calib/avg_num_step_conf": 4.98828125,
"calib/ece": 0.12952000000000005,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.36,
"calib/gap": 0.20682148040638615,
"calib/mean_conf": 0.76,
"calib/mu_c": 0.8352830188679246,
"calib/mu_w": 0.6284615384615384,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12676000000000004,
"calib/std_conf": 0.21719484340103473,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5181912144702843,
"calib/step_q_c_n": 774.0,
"calib/step_q_gap": 0.0815311747088529,
"calib/step_q_w": 0.43666003976143136,
"calib/step_q_w_n": 503.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2728.0,
"completions/max_terminated_length": 2728.0,
"completions/mean_length": 458.69921875,
"completions/mean_terminated_length": 458.69921875,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.08,
"grad_norm": 0.03306273743510246,
"kl": 0.1181640625,
"learning_rate": 3.4722222222222224e-06,
"loss": -0.085,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.036380767822265625,
"mask/share_reasoning": 0.8421682119369507,
"mask/share_step_conf": 0.1214509978890419,
"num_tokens": 18034382.0,
"reward": 0.8006812334060669,
"reward_std": 0.1714782565832138,
"rewards/accuracy_reward_step": 0.62109375,
"rewards/final_brier_reward_step": 0.7690855264663696,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8322769403457642,
"step": 75
},
{
"adv/mean_abs_final_conf": 0.7428159117698669,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7283253073692322,
"adv/std_final_conf": 0.9327108860015869,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351808428764343,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7631460818263435,
"calib/avg_num_step_conf": 4.35546875,
"calib/ece": 0.11960159362549802,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.27091633466135456,
"calib/gap": 0.23035462357316916,
"calib/mean_conf": 0.6670916334661354,
"calib/mu_c": 0.7707971014492754,
"calib/mu_w": 0.5404424778761062,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.11844621513944223,
"calib/std_conf": 0.25134678582140274,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5085766423357663,
"calib/step_q_c_n": 548.0,
"calib/step_q_gap": 0.09956429665675398,
"calib/step_q_w": 0.4090123456790123,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2749.0,
"completions/max_terminated_length": 2749.0,
"completions/mean_length": 471.109375,
"completions/mean_terminated_length": 471.109375,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.03533868491649628,
"kl": 0.117095947265625,
"learning_rate": 3.444444444444445e-06,
"loss": -0.1891,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03789925202727318,
"mask/share_reasoning": 0.8529291152954102,
"mask/share_step_conf": 0.10917161405086517,
"num_tokens": 18258042.0,
"reward": 0.7864531874656677,
"reward_std": 0.18198052048683167,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7521445751190186,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8207618594169617,
"step": 76
},
{
"adv/mean_abs_final_conf": 0.7651739120483398,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7566498517990112,
"adv/std_final_conf": 0.9274589419364929,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351546764373779,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.6154385964912281,
"calib/avg_num_step_conf": 4.5703125,
"calib/ece": 0.15612244897959182,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.1469387755102041,
"calib/gap": 0.0977754385964913,
"calib/mean_conf": 0.6050204081632653,
"calib/mu_c": 0.6429333333333334,
"calib/mu_w": 0.5451578947368421,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.07444897959183673,
"calib/std_conf": 0.2362714186205692,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.4585238784370478,
"calib/step_q_c_n": 691.0,
"calib/step_q_gap": 0.04535060077525238,
"calib/step_q_w": 0.4131732776617954,
"calib/step_q_w_n": 479.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2691.0,
"completions/max_terminated_length": 2691.0,
"completions/mean_length": 483.76953125,
"completions/mean_terminated_length": 483.76953125,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.044293832033872604,
"kl": 0.11598968505859375,
"learning_rate": 3.416666666666667e-06,
"loss": -0.2489,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03868642449378967,
"mask/share_reasoning": 0.8451880812644958,
"mask/share_step_conf": 0.11612546443939209,
"num_tokens": 18486551.0,
"reward": 0.7469485998153687,
"reward_std": 0.19202597439289093,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.6997886896133423,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.794108510017395,
"step": 77
},
{
"adv/mean_abs_final_conf": 0.7482144832611084,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7161533832550049,
"adv/std_final_conf": 0.9358103275299072,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349690079689026,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7036554354736172,
"calib/avg_num_step_conf": 4.5859375,
"calib/ece": 0.08665338645418323,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.09561752988047809,
"calib/gap": 0.16321233312142414,
"calib/mean_conf": 0.5792430278884463,
"calib/mu_c": 0.6579230769230769,
"calib/mu_w": 0.4947107438016528,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.07398406374501992,
"calib/std_conf": 0.22992252441260763,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.47225155279503106,
"calib/step_q_c_n": 644.0,
"calib/step_q_gap": 0.059949666002578283,
"calib/step_q_w": 0.4123018867924528,
"calib/step_q_w_n": 530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2390.0,
"completions/max_terminated_length": 2390.0,
"completions/mean_length": 518.328125,
"completions/mean_terminated_length": 518.328125,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.0832,
"grad_norm": 0.038161709904670715,
"kl": 0.1164093017578125,
"learning_rate": 3.3888888888888893e-06,
"loss": -0.1816,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.032433316111564636,
"mask/share_reasoning": 0.8684175610542297,
"mask/share_step_conf": 0.09914913028478622,
"num_tokens": 18727267.0,
"reward": 0.7633633613586426,
"reward_std": 0.20355360209941864,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7194637060165405,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.8072630167007446,
"step": 78
},
{
"adv/mean_abs_final_conf": 0.7439166307449341,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7461695075035095,
"adv/std_final_conf": 0.9325432181358337,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348044991493225,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6385620915032678,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.12332015810276681,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.18181818181818182,
"calib/gap": 0.12320457516339867,
"calib/mean_conf": 0.6094071146245059,
"calib/mu_c": 0.6581045751633987,
"calib/mu_w": 0.5349,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.0639920948616601,
"calib/std_conf": 0.252071348416406,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.45882926829268295,
"calib/step_q_c_n": 820.0,
"calib/step_q_gap": 0.038238550993104914,
"calib/step_q_w": 0.42059071729957803,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1346.0,
"completions/max_terminated_length": 1346.0,
"completions/mean_length": 489.1875,
"completions/mean_terminated_length": 491.1059265136719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.03568899631500244,
"kl": 0.105499267578125,
"learning_rate": 3.3611111111111117e-06,
"loss": -0.1682,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03434523940086365,
"mask/share_reasoning": 0.8567074537277222,
"mask/share_step_conf": 0.10504105687141418,
"num_tokens": 18958875.0,
"reward": 0.7738837003707886,
"reward_std": 0.18652935326099396,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7215226292610168,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8262448310852051,
"step": 79
},
{
"adv/mean_abs_final_conf": 0.7373925447463989,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7588784098625183,
"adv/std_final_conf": 0.9170923829078674,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348174333572388,
"calib/answer_extract_rate": 1.0,
"calib/auroc": 0.6928098391674551,
"calib/avg_num_step_conf": 5.02734375,
"calib/ece": 0.19398437500000007,
"calib/final_conf_rate": 1.0,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.3984375,
"calib/gap": 0.14165121412803539,
"calib/mean_conf": 0.7442187499999999,
"calib/mu_c": 0.802317880794702,
"calib/mu_w": 0.6606666666666666,
"calib/nonempty_final_conf_rate": 1.0,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.17417968750000007,
"calib/std_conf": 0.24461739696603244,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4986915887850467,
"calib/step_q_c_n": 749.0,
"calib/step_q_gap": 0.052037313692109954,
"calib/step_q_w": 0.44665427509293676,
"calib/step_q_w_n": 538.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1238.0,
"completions/max_terminated_length": 1238.0,
"completions/mean_length": 424.6015625,
"completions/mean_terminated_length": 426.2666931152344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.040913067758083344,
"kl": 0.1164703369140625,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0902,
"mask/has_final_conf_rate": 0.99609375,
"mask/share_final_conf": 0.03784197196364403,
"mask/share_reasoning": 0.8360422849655151,
"mask/share_step_conf": 0.12220950424671173,
"num_tokens": 19169733.0,
"reward": 0.779463529586792,
"reward_std": 0.15755271911621094,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7320824265480042,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8268446922302246,
"step": 80
},
{
"adv/mean_abs_final_conf": 0.7503237724304199,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7460405826568604,
"adv/std_final_conf": 0.9145488142967224,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935416042804718,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.719396551724138,
"calib/avg_num_step_conf": 4.859375,
"calib/ece": 0.14979253112033195,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.3817427385892116,
"calib/gap": 0.21781681034482758,
"calib/mean_conf": 0.7126141078838175,
"calib/mu_c": 0.7993793103448276,
"calib/mu_w": 0.5815625,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.1303734439834025,
"calib/std_conf": 0.27592209129547424,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.5065610142630744,
"calib/step_q_c_n": 631.0,
"calib/step_q_gap": 0.09197700121250346,
"calib/step_q_w": 0.41458401305057097,
"calib/step_q_w_n": 613.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2776.0,
"completions/max_terminated_length": 2776.0,
"completions/mean_length": 506.01953125,
"completions/mean_terminated_length": 512.019775390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.0864,
"grad_norm": 0.026412300765514374,
"kl": 0.104217529296875,
"learning_rate": 3.3055555555555558e-06,
"loss": -0.1094,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.03644543141126633,
"mask/share_reasoning": 0.8455771207809448,
"mask/share_step_conf": 0.10625872015953064,
"num_tokens": 19405522.0,
"reward": 0.7318441867828369,
"reward_std": 0.2443157136440277,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.6915718913078308,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l2_reward": 0.7721166610717773,
"step": 81
},
{
"adv/mean_abs_final_conf": 0.7479487657546997,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7465330362319946,
"adv/std_final_conf": 0.8972213864326477,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349361658096313,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7302356902356902,
"calib/avg_num_step_conf": 4.953125,
"calib/ece": 0.19634538152610445,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.4859437751004016,
"calib/gap": 0.20788484848484867,
"calib/mean_conf": 0.7700803212851406,
"calib/mu_c": 0.8527333333333335,
"calib/mu_w": 0.6448484848484848,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.18200803212851407,
"calib/std_conf": 0.2656491510623321,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.518390977443609,
"calib/step_q_c_n": 665.0,
"calib/step_q_gap": 0.09661651641541663,
"calib/step_q_w": 0.4217744610281923,
"calib/step_q_w_n": 603.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2958.0,
"completions/max_terminated_length": 2958.0,
"completions/mean_length": 467.06640625,
"completions/mean_terminated_length": 468.8980712890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.02705484628677368,
"kl": 0.11284637451171875,
"learning_rate": 3.277777777777778e-06,
"loss": -0.047,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.038292378187179565,
"mask/share_reasoning": 0.8431802988052368,
"mask/share_step_conf": 0.1146211326122284,
"num_tokens": 19630643.0,
"reward": 0.7625449895858765,
"reward_std": 0.19928604364395142,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7182894945144653,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8068004846572876,
"step": 82
},
{
"adv/mean_abs_final_conf": 0.7600466012954712,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7641905546188354,
"adv/std_final_conf": 0.9219870567321777,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346417784690857,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7885663034812649,
"calib/avg_num_step_conf": 4.84765625,
"calib/ece": 0.2078225806451613,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.5403225806451613,
"calib/gap": 0.2661785809194791,
"calib/mean_conf": 0.7763709677419356,
"calib/mu_c": 0.8901408450704225,
"calib/mu_w": 0.6239622641509434,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.20580645161290323,
"calib/std_conf": 0.2691888492796142,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.523275039745628,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.12392863451686975,
"calib/step_q_w": 0.3993464052287582,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2582.0,
"completions/max_terminated_length": 2582.0,
"completions/mean_length": 526.734375,
"completions/mean_terminated_length": 528.800048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.08767831325531006,
"kl": 0.1004638671875,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.1739,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03471631184220314,
"mask/share_reasoning": 0.8556775450706482,
"mask/share_step_conf": 0.10569989681243896,
"num_tokens": 19872751.0,
"reward": 0.7552716732025146,
"reward_std": 0.20535266399383545,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7049202919006348,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l2_reward": 0.8056230545043945,
"step": 83
},
{
"adv/mean_abs_final_conf": 0.7469096779823303,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7453149557113647,
"adv/std_final_conf": 0.9164627194404602,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347103238105774,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.75181598062954,
"calib/avg_num_step_conf": 4.5546875,
"calib/ece": 0.2758565737051793,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.5617529880478087,
"calib/gap": 0.21050019115585572,
"calib/mean_conf": 0.7904382470119522,
"calib/mu_c": 0.8893984962406015,
"calib/mu_w": 0.6788983050847458,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.26820717131474103,
"calib/std_conf": 0.2728136689746266,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5189006309148264,
"calib/step_q_c_n": 634.0,
"calib/step_q_gap": 0.07365627001257075,
"calib/step_q_w": 0.44524436090225566,
"calib/step_q_w_n": 532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2271.0,
"completions/max_terminated_length": 2271.0,
"completions/mean_length": 443.578125,
"completions/mean_terminated_length": 443.578125,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.0896,
"grad_norm": 0.03372732549905777,
"kl": 0.11348724365234375,
"learning_rate": 3.2222222222222227e-06,
"loss": -0.0607,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03929414972662926,
"mask/share_reasoning": 0.8493955135345459,
"mask/share_step_conf": 0.11131033301353455,
"num_tokens": 20092227.0,
"reward": 0.7540429830551147,
"reward_std": 0.19966432452201843,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.6865949630737305,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8214911222457886,
"step": 84
},
{
"adv/mean_abs_final_conf": 0.7252588868141174,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7495740056037903,
"adv/std_final_conf": 0.8995879888534546,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350873231887817,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.721048951048951,
"calib/avg_num_step_conf": 4.2890625,
"calib/ece": 0.26483333333333337,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.8828125,
"calib/frac_conf_gt_0.9": 0.625,
"calib/gap": 0.23826573426573439,
"calib/mean_conf": 0.8063333333333333,
"calib/mu_c": 0.9155384615384616,
"calib/mu_w": 0.6772727272727272,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.26475000000000004,
"calib/std_conf": 0.27401774070709767,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.512791519434629,
"calib/step_q_c_n": 566.0,
"calib/step_q_gap": 0.06890054199101991,
"calib/step_q_w": 0.44389097744360906,
"calib/step_q_w_n": 532.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2949.0,
"completions/max_terminated_length": 2949.0,
"completions/mean_length": 521.86328125,
"completions/mean_terminated_length": 528.0513916015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.04208715260028839,
"kl": 0.097900390625,
"learning_rate": 3.1944444444444443e-06,
"loss": -0.1331,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.035177890211343765,
"mask/share_reasoning": 0.8520656824111938,
"mask/share_step_conf": 0.10103766620159149,
"num_tokens": 20333648.0,
"reward": 0.6964380145072937,
"reward_std": 0.25430044531822205,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.6360054612159729,
"rewards/format_reward_step": 0.8828125,
"rewards/step_l2_reward": 0.7568705081939697,
"step": 85
},
{
"adv/mean_abs_final_conf": 0.7735016345977783,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7450942993164062,
"adv/std_final_conf": 0.9120759963989258,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9358606934547424,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.726973241898615,
"calib/avg_num_step_conf": 4.22265625,
"calib/ece": 0.30175510204081624,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.5387755102040817,
"calib/gap": 0.23299852090896878,
"calib/mean_conf": 0.7548163265306123,
"calib/mu_c": 0.8822522522522523,
"calib/mu_w": 0.6492537313432836,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.30175510204081624,
"calib/std_conf": 0.2997680513854063,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.49515030060120235,
"calib/step_q_c_n": 499.0,
"calib/step_q_gap": 0.06975511159776593,
"calib/step_q_w": 0.4253951890034364,
"calib/step_q_w_n": 582.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2581.0,
"completions/max_terminated_length": 2581.0,
"completions/mean_length": 467.9921875,
"completions/mean_terminated_length": 469.8274841308594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.04277937859296799,
"kl": 0.1038665771484375,
"learning_rate": 3.1666666666666667e-06,
"loss": -0.2479,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03794998303055763,
"mask/share_reasoning": 0.8551421165466309,
"mask/share_step_conf": 0.10300161689519882,
"num_tokens": 20558966.0,
"reward": 0.6861547231674194,
"reward_std": 0.2804429829120636,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.6155492067337036,
"rewards/format_reward_step": 0.90234375,
"rewards/step_l2_reward": 0.7567602396011353,
"step": 86
},
{
"adv/mean_abs_final_conf": 0.7481697797775269,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7582245469093323,
"adv/std_final_conf": 0.9145613312721252,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349111914634705,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.714245472837022,
"calib/avg_num_step_conf": 3.984375,
"calib/ece": 0.17402439024390248,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.90234375,
"calib/frac_conf_gt_0.9": 0.6341463414634146,
"calib/gap": 0.17621167002012095,
"calib/mean_conf": 0.8381707317073173,
"calib/mu_c": 0.8890285714285716,
"calib/mu_w": 0.7128169014084507,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9140625,
"calib/pce": 0.15040650406504066,
"calib/std_conf": 0.2357351141035544,
"calib/step_conf_rate": 0.9140625,
"calib/step_q_c": 0.533342939481268,
"calib/step_q_c_n": 694.0,
"calib/step_q_gap": 0.10831226463464222,
"calib/step_q_w": 0.42503067484662577,
"calib/step_q_w_n": 326.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2601.0,
"completions/max_terminated_length": 2601.0,
"completions/mean_length": 436.5625,
"completions/mean_terminated_length": 443.4920959472656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.0928,
"grad_norm": 0.04709944874048233,
"kl": 0.120513916015625,
"learning_rate": 3.138888888888889e-06,
"loss": -0.185,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03978754207491875,
"mask/share_reasoning": 0.8435720205307007,
"mask/share_step_conf": 0.10101540386676788,
"num_tokens": 20776222.0,
"reward": 0.748905599117279,
"reward_std": 0.251054048538208,
"rewards/accuracy_reward_step": 0.6875,
"rewards/final_brier_reward_step": 0.7142261862754822,
"rewards/format_reward_step": 0.90234375,
"rewards/step_l2_reward": 0.7835850715637207,
"step": 87
},
{
"adv/mean_abs_final_conf": 0.7659087777137756,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7348840236663818,
"adv/std_final_conf": 0.9296475052833557,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355849027633667,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.8344155844155844,
"calib/avg_num_step_conf": 4.19140625,
"calib/ece": 0.15074803149606297,
"calib/final_conf_rate": 0.9921875,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.5275590551181102,
"calib/gap": 0.3850636363636363,
"calib/mean_conf": 0.7547637795275591,
"calib/mu_c": 0.9063636363636363,
"calib/mu_w": 0.5213,
"calib/nonempty_final_conf_rate": 0.9921875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.91796875,
"calib/pce": 0.14960629921259838,
"calib/std_conf": 0.2958408264482595,
"calib/step_conf_rate": 0.91796875,
"calib/step_q_c": 0.4859428571428571,
"calib/step_q_c_n": 700.0,
"calib/step_q_gap": 0.05403937188816543,
"calib/step_q_w": 0.4319034852546917,
"calib/step_q_w_n": 373.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1837.0,
"completions/max_terminated_length": 1837.0,
"completions/mean_length": 474.49609375,
"completions/mean_terminated_length": 474.49609375,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.030993729829788208,
"kl": 0.1110076904296875,
"learning_rate": 3.1111111111111116e-06,
"loss": -0.1947,
"mask/has_final_conf_rate": 0.9921875,
"mask/share_final_conf": 0.03716718778014183,
"mask/share_reasoning": 0.862873911857605,
"mask/share_step_conf": 0.09995885193347931,
"num_tokens": 21007541.0,
"reward": 0.7658560276031494,
"reward_std": 0.2499769777059555,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7555378675460815,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l2_reward": 0.7761742472648621,
"step": 88
},
{
"adv/mean_abs_final_conf": 0.7708313465118408,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7514389157295227,
"adv/std_final_conf": 0.9173318147659302,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9359842538833618,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.8226373032493415,
"calib/avg_num_step_conf": 3.84375,
"calib/ece": 0.2592622950819672,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.86328125,
"calib/frac_conf_gt_0.9": 0.48770491803278687,
"calib/gap": 0.32950753225697516,
"calib/mean_conf": 0.7145081967213115,
"calib/mu_c": 0.89141592920354,
"calib/mu_w": 0.5619083969465648,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.890625,
"calib/pce": 0.25532786885245895,
"calib/std_conf": 0.31482522463100265,
"calib/step_conf_rate": 0.890625,
"calib/step_q_c": 0.5899484536082473,
"calib/step_q_c_n": 388.0,
"calib/step_q_gap": 0.1898981180377775,
"calib/step_q_w": 0.4000503355704698,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2820.0,
"completions/max_terminated_length": 2820.0,
"completions/mean_length": 511.546875,
"completions/mean_terminated_length": 511.546875,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.03927363082766533,
"kl": 0.10543060302734375,
"learning_rate": 3.0833333333333336e-06,
"loss": -0.1832,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03691529855132103,
"mask/share_reasoning": 0.8710620999336243,
"mask/share_step_conf": 0.0920226126909256,
"num_tokens": 21247385.0,
"reward": 0.6905965805053711,
"reward_std": 0.2838398814201355,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.64203941822052,
"rewards/format_reward_step": 0.86328125,
"rewards/step_l2_reward": 0.7391537427902222,
"step": 89
},
{
"adv/mean_abs_final_conf": 0.7335283160209656,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7159548997879028,
"adv/std_final_conf": 0.9096397161483765,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355127215385437,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6708086916074159,
"calib/avg_num_step_conf": 5.3046875,
"calib/ece": 0.19730666666666682,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.572,
"calib/gap": 0.20785478547854785,
"calib/mean_conf": 0.7626933333333333,
"calib/mu_c": 0.8466666666666667,
"calib/mu_w": 0.6388118811881188,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.18200000000000016,
"calib/std_conf": 0.29880582948351814,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.47783783783783784,
"calib/step_q_c_n": 740.0,
"calib/step_q_gap": 0.08791874398670518,
"calib/step_q_w": 0.38991909385113266,
"calib/step_q_w_n": 618.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2859.0,
"completions/max_terminated_length": 2859.0,
"completions/mean_length": 506.33984375,
"completions/mean_terminated_length": 506.33984375,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.096,
"grad_norm": 0.035200610756874084,
"kl": 0.1154632568359375,
"learning_rate": 3.055555555555556e-06,
"loss": -0.0727,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03638852387666702,
"mask/share_reasoning": 0.8510505557060242,
"mask/share_step_conf": 0.11256091296672821,
"num_tokens": 21480328.0,
"reward": 0.7477128505706787,
"reward_std": 0.23210914433002472,
"rewards/accuracy_reward_step": 0.58203125,
"rewards/final_brier_reward_step": 0.6938439607620239,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8015816807746887,
"step": 90
},
{
"adv/mean_abs_final_conf": 0.7306746244430542,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7812052965164185,
"adv/std_final_conf": 0.9120433330535889,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345096349716187,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7520235467255335,
"calib/avg_num_step_conf": 4.40234375,
"calib/ece": 0.20731999999999984,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.608,
"calib/gap": 0.24788347046625203,
"calib/mean_conf": 0.7935599999999999,
"calib/mu_c": 0.8917218543046358,
"calib/mu_w": 0.6438383838383838,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.19843999999999984,
"calib/std_conf": 0.2806102749366102,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5113258785942493,
"calib/step_q_c_n": 626.0,
"calib/step_q_gap": 0.10529793448247277,
"calib/step_q_w": 0.4060279441117765,
"calib/step_q_w_n": 501.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2719.0,
"completions/max_terminated_length": 2719.0,
"completions/mean_length": 481.265625,
"completions/mean_terminated_length": 481.265625,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.039202846586704254,
"kl": 0.1165771484375,
"learning_rate": 3.0277777777777776e-06,
"loss": -0.1038,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03410191461443901,
"mask/share_reasoning": 0.8663723468780518,
"mask/share_step_conf": 0.09952573478221893,
"num_tokens": 21711244.0,
"reward": 0.780426025390625,
"reward_std": 0.19417354464530945,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7288113236427307,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8320407271385193,
"step": 91
},
{
"adv/mean_abs_final_conf": 0.7099396586418152,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7603292465209961,
"adv/std_final_conf": 0.8862280249595642,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935433566570282,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.716020436730123,
"calib/avg_num_step_conf": 4.26171875,
"calib/ece": 0.19447154471544736,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.88671875,
"calib/frac_conf_gt_0.9": 0.5691056910569106,
"calib/gap": 0.23869120940649502,
"calib/mean_conf": 0.770569105691057,
"calib/mu_c": 0.8617763157894738,
"calib/mu_w": 0.6230851063829788,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.8984375,
"calib/pce": 0.17357723577235795,
"calib/std_conf": 0.3021543266185711,
"calib/step_conf_rate": 0.8984375,
"calib/step_q_c": 0.5257218309859155,
"calib/step_q_c_n": 568.0,
"calib/step_q_gap": 0.16543693614843935,
"calib/step_q_w": 0.36028489483747617,
"calib/step_q_w_n": 523.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2613.0,
"completions/max_terminated_length": 2613.0,
"completions/mean_length": 466.65234375,
"completions/mean_terminated_length": 466.65234375,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.03666877746582031,
"kl": 0.11454010009765625,
"learning_rate": 3e-06,
"loss": -0.1683,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.037959203124046326,
"mask/share_reasoning": 0.8583852052688599,
"mask/share_step_conf": 0.10365556925535202,
"num_tokens": 21937427.0,
"reward": 0.7203973531723022,
"reward_std": 0.2536795139312744,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.6676589250564575,
"rewards/format_reward_step": 0.88671875,
"rewards/step_l2_reward": 0.7731357216835022,
"step": 92
},
{
"adv/mean_abs_final_conf": 0.6869341731071472,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7488905191421509,
"adv/std_final_conf": 0.8937593102455139,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355915188789368,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7200474898236093,
"calib/avg_num_step_conf": 4.875,
"calib/ece": 0.17467213114754096,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.3524590163934426,
"calib/gap": 0.24422523744911817,
"calib/mean_conf": 0.6580327868852459,
"calib/mu_c": 0.768134328358209,
"calib/mu_w": 0.5239090909090909,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.14176229508196717,
"calib/std_conf": 0.3129673930123625,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.5013162118780097,
"calib/step_q_c_n": 623.0,
"calib/step_q_gap": 0.09379621187800963,
"calib/step_q_w": 0.40752000000000005,
"calib/step_q_w_n": 625.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2065.0,
"completions/max_terminated_length": 2065.0,
"completions/mean_length": 487.45703125,
"completions/mean_terminated_length": 491.2952880859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.0992,
"grad_norm": 0.046914614737033844,
"kl": 0.108612060546875,
"learning_rate": 2.9722222222222225e-06,
"loss": -0.1615,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.0355367548763752,
"mask/share_reasoning": 0.8486747145652771,
"mask/share_step_conf": 0.10797599703073502,
"num_tokens": 22167992.0,
"reward": 0.7335118651390076,
"reward_std": 0.24725374579429626,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6891273260116577,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l2_reward": 0.7778963446617126,
"step": 93
},
{
"adv/mean_abs_final_conf": 0.7514272928237915,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7241270542144775,
"adv/std_final_conf": 0.9320777058601379,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356358051300049,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.753915504080021,
"calib/avg_num_step_conf": 3.984375,
"calib/ece": 0.1504048582995951,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.3562753036437247,
"calib/gap": 0.2947216372729665,
"calib/mean_conf": 0.6417408906882591,
"calib/mu_c": 0.7801526717557252,
"calib/mu_w": 0.4854310344827587,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9296875,
"calib/pce": 0.1308906882591093,
"calib/std_conf": 0.327217142534491,
"calib/step_conf_rate": 0.9296875,
"calib/step_q_c": 0.5525203252032521,
"calib/step_q_c_n": 492.0,
"calib/step_q_gap": 0.15824002217294908,
"calib/step_q_w": 0.39428030303030304,
"calib/step_q_w_n": 528.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2296.0,
"completions/max_terminated_length": 2296.0,
"completions/mean_length": 433.24609375,
"completions/mean_terminated_length": 436.657470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.03202759847044945,
"kl": 0.1334075927734375,
"learning_rate": 2.944444444444445e-06,
"loss": -0.157,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03922412917017937,
"mask/share_reasoning": 0.8538342714309692,
"mask/share_step_conf": 0.09912911057472229,
"num_tokens": 22387583.0,
"reward": 0.7599334716796875,
"reward_std": 0.21481086313724518,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7212663888931274,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7986005544662476,
"step": 94
},
{
"adv/mean_abs_final_conf": 0.7279517650604248,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7865815758705139,
"adv/std_final_conf": 0.9090731143951416,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9359537959098816,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.805735294117647,
"calib/avg_num_step_conf": 3.58984375,
"calib/ece": 0.15635593220338972,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.828125,
"calib/frac_conf_gt_0.9": 0.4576271186440678,
"calib/gap": 0.3495029411764706,
"calib/mean_conf": 0.6955084745762713,
"calib/mu_c": 0.8436029411764706,
"calib/mu_w": 0.4941,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.84765625,
"calib/pce": 0.13779661016949143,
"calib/std_conf": 0.3303300800602154,
"calib/step_conf_rate": 0.84765625,
"calib/step_q_c": 0.5449999999999999,
"calib/step_q_c_n": 506.0,
"calib/step_q_gap": 0.1501089588377723,
"calib/step_q_w": 0.3948910411622276,
"calib/step_q_w_n": 413.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1464.0,
"completions/max_terminated_length": 1464.0,
"completions/mean_length": 454.76953125,
"completions/mean_terminated_length": 456.552978515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.04344727098941803,
"kl": 0.111175537109375,
"learning_rate": 2.916666666666667e-06,
"loss": -0.3002,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.03688206151127815,
"mask/share_reasoning": 0.8678974509239197,
"mask/share_step_conf": 0.09131423383951187,
"num_tokens": 22610132.0,
"reward": 0.6868751049041748,
"reward_std": 0.2803459167480469,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6729038953781128,
"rewards/format_reward_step": 0.828125,
"rewards/step_l2_reward": 0.7008463740348816,
"step": 95
},
{
"adv/mean_abs_final_conf": 0.7357560992240906,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7510946393013,
"adv/std_final_conf": 0.8989567160606384,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351812601089478,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8593205574912893,
"calib/avg_num_step_conf": 3.69921875,
"calib/ece": 0.04665322580645161,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.8828125,
"calib/frac_conf_gt_0.9": 0.39919354838709675,
"calib/gap": 0.42188443670150994,
"calib/mean_conf": 0.65625,
"calib/mu_c": 0.7991463414634147,
"calib/mu_w": 0.37726190476190474,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.89453125,
"calib/pce": 0.020806451612903232,
"calib/std_conf": 0.32958069952592794,
"calib/step_conf_rate": 0.89453125,
"calib/step_q_c": 0.5652158894645942,
"calib/step_q_c_n": 579.0,
"calib/step_q_gap": 0.16162893294285507,
"calib/step_q_w": 0.40358695652173915,
"calib/step_q_w_n": 368.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2961.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 434.06640625,
"completions/mean_terminated_length": 434.06640625,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.1024,
"grad_norm": 0.05312202498316765,
"kl": 0.135467529296875,
"learning_rate": 2.888888888888889e-06,
"loss": -0.1138,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03864780068397522,
"mask/share_reasoning": 0.8665939569473267,
"mask/share_step_conf": 0.09475824236869812,
"num_tokens": 22827069.0,
"reward": 0.767045259475708,
"reward_std": 0.24355840682983398,
"rewards/accuracy_reward_step": 0.64453125,
"rewards/final_brier_reward_step": 0.7619215250015259,
"rewards/format_reward_step": 0.8828125,
"rewards/step_l2_reward": 0.7721689939498901,
"step": 96
},
{
"adv/mean_abs_final_conf": 0.7440429925918579,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7356593608856201,
"adv/std_final_conf": 0.9356138706207275,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9362950325012207,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7425941599661448,
"calib/avg_num_step_conf": 3.796875,
"calib/ece": 0.16780082987551875,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.87109375,
"calib/frac_conf_gt_0.9": 0.3360995850622407,
"calib/gap": 0.26464381436027645,
"calib/mean_conf": 0.6130290456431535,
"calib/mu_c": 0.7250359712230215,
"calib/mu_w": 0.4603921568627451,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.890625,
"calib/pce": 0.10203319502074698,
"calib/std_conf": 0.3310945559284828,
"calib/step_conf_rate": 0.890625,
"calib/step_q_c": 0.5561842105263157,
"calib/step_q_c_n": 532.0,
"calib/step_q_gap": 0.18259330143540664,
"calib/step_q_w": 0.3735909090909091,
"calib/step_q_w_n": 440.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2284.0,
"completions/max_terminated_length": 2284.0,
"completions/mean_length": 444.73046875,
"completions/mean_terminated_length": 448.2322692871094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.04188898950815201,
"kl": 0.128448486328125,
"learning_rate": 2.861111111111111e-06,
"loss": -0.3184,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.038415439426898956,
"mask/share_reasoning": 0.8580073118209839,
"mask/share_step_conf": 0.09576477110385895,
"num_tokens": 23045992.0,
"reward": 0.7061910629272461,
"reward_std": 0.30235719680786133,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.6707335710525513,
"rewards/format_reward_step": 0.87109375,
"rewards/step_l2_reward": 0.7416484951972961,
"step": 97
},
{
"adv/mean_abs_final_conf": 0.7775865793228149,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7880679368972778,
"adv/std_final_conf": 0.9250810146331787,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9360907077789307,
"calib/answer_extract_rate": 0.90625,
"calib/auroc": 0.6808838643371018,
"calib/avg_num_step_conf": 3.46875,
"calib/ece": 0.18421739130434783,
"calib/final_conf_rate": 0.8984375,
"calib/format_rate": 0.8125,
"calib/frac_conf_gt_0.9": 0.4217391304347826,
"calib/gap": 0.21805755395683457,
"calib/mean_conf": 0.6917826086956521,
"calib/mu_c": 0.7780575539568346,
"calib/mu_w": 0.56,
"calib/nonempty_final_conf_rate": 0.8984375,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.84375,
"calib/pce": 0.13582608695652176,
"calib/std_conf": 0.3202374739120057,
"calib/step_conf_rate": 0.84375,
"calib/step_q_c": 0.5706995884773661,
"calib/step_q_c_n": 486.0,
"calib/step_q_gap": 0.1187344143480129,
"calib/step_q_w": 0.4519651741293532,
"calib/step_q_w_n": 402.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2852.0,
"completions/max_terminated_length": 2852.0,
"completions/mean_length": 490.28515625,
"completions/mean_terminated_length": 494.1456604003906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.0338410884141922,
"kl": 0.1101226806640625,
"learning_rate": 2.8333333333333335e-06,
"loss": -0.2987,
"mask/has_final_conf_rate": 0.8984375,
"mask/share_final_conf": 0.03844252973794937,
"mask/share_reasoning": 0.8692131638526917,
"mask/share_step_conf": 0.08453181385993958,
"num_tokens": 23277689.0,
"reward": 0.6472041010856628,
"reward_std": 0.3040306270122528,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.6219742298126221,
"rewards/format_reward_step": 0.8125,
"rewards/step_l2_reward": 0.6724340915679932,
"step": 98
},
{
"adv/mean_abs_final_conf": 0.7542859315872192,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7670855522155762,
"adv/std_final_conf": 0.9212293028831482,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9358445405960083,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.7563062075016451,
"calib/avg_num_step_conf": 3.56640625,
"calib/ece": 0.1538235294117648,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.82421875,
"calib/frac_conf_gt_0.9": 0.2605042016806723,
"calib/gap": 0.3153067193097901,
"calib/mean_conf": 0.5324789915966387,
"calib/mu_c": 0.7192783505154638,
"calib/mu_w": 0.40397163120567375,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.83984375,
"calib/pce": 0.13936974789915976,
"calib/std_conf": 0.34915887762958936,
"calib/step_conf_rate": 0.83984375,
"calib/step_q_c": 0.569861111111111,
"calib/step_q_c_n": 360.0,
"calib/step_q_gap": 0.15532223226843472,
"calib/step_q_w": 0.4145388788426763,
"calib/step_q_w_n": 553.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2424.0,
"completions/max_terminated_length": 2424.0,
"completions/mean_length": 557.60546875,
"completions/mean_terminated_length": 557.60546875,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.1056,
"grad_norm": 0.03545257821679115,
"kl": 0.108612060546875,
"learning_rate": 2.805555555555556e-06,
"loss": -0.3171,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.03294376656413078,
"mask/share_reasoning": 0.8888589143753052,
"mask/share_step_conf": 0.07819731533527374,
"num_tokens": 23526236.0,
"reward": 0.6546341180801392,
"reward_std": 0.3044378459453583,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.6342898607254028,
"rewards/format_reward_step": 0.82421875,
"rewards/step_l2_reward": 0.6749784350395203,
"step": 99
},
{
"adv/mean_abs_final_conf": 0.7506855726242065,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7517011165618896,
"adv/std_final_conf": 0.9101697206497192,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9359030723571777,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.8300634943283155,
"calib/avg_num_step_conf": 3.640625,
"calib/ece": 0.10945378151260507,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.86328125,
"calib/frac_conf_gt_0.9": 0.41596638655462187,
"calib/gap": 0.4138752942855104,
"calib/mean_conf": 0.6333193277310926,
"calib/mu_c": 0.8193893129770993,
"calib/mu_w": 0.4055140186915889,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.890625,
"calib/pce": 0.09617647058823532,
"calib/std_conf": 0.3601883128571439,
"calib/step_conf_rate": 0.890625,
"calib/step_q_c": 0.5623626373626374,
"calib/step_q_c_n": 546.0,
"calib/step_q_gap": 0.1162227409895803,
"calib/step_q_w": 0.44613989637305707,
"calib/step_q_w_n": 386.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2924.0,
"completions/max_terminated_length": 2924.0,
"completions/mean_length": 543.05078125,
"completions/mean_terminated_length": 543.05078125,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.05612342059612274,
"kl": 0.34037017822265625,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.2418,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.035137295722961426,
"mask/share_reasoning": 0.8781794905662537,
"mask/share_step_conf": 0.08668322116136551,
"num_tokens": 23772665.0,
"reward": 0.7290331721305847,
"reward_std": 0.2792990803718567,
"rewards/accuracy_reward_step": 0.51171875,
"rewards/final_brier_reward_step": 0.7114968299865723,
"rewards/format_reward_step": 0.86328125,
"rewards/step_l2_reward": 0.7465694546699524,
"step": 100
},
{
"adv/mean_abs_final_conf": 0.7654350996017456,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7380661368370056,
"adv/std_final_conf": 0.9364844560623169,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9359593987464905,
"calib/answer_extract_rate": 0.90234375,
"calib/auroc": 0.7199817726133516,
"calib/avg_num_step_conf": 3.5,
"calib/ece": 0.174698275862069,
"calib/final_conf_rate": 0.90625,
"calib/format_rate": 0.8203125,
"calib/frac_conf_gt_0.9": 0.2974137931034483,
"calib/gap": 0.2618310928837246,
"calib/mean_conf": 0.5367672413793103,
"calib/mu_c": 0.686868686868687,
"calib/mu_w": 0.4250375939849624,
"calib/nonempty_final_conf_rate": 0.90625,
"calib/nonempty_reasoning_rate": 0.9296875,
"calib/nonempty_step_conf_rate": 0.8515625,
"calib/pce": 0.14237068965517247,
"calib/std_conf": 0.3526749714377204,
"calib/step_conf_rate": 0.8515625,
"calib/step_q_c": 0.5205555555555555,
"calib/step_q_c_n": 396.0,
"calib/step_q_gap": 0.0778955555555556,
"calib/step_q_w": 0.44265999999999994,
"calib/step_q_w_n": 500.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2369.0,
"completions/max_terminated_length": 2369.0,
"completions/mean_length": 500.140625,
"completions/mean_terminated_length": 506.0711669921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.030985474586486816,
"kl": 0.1172943115234375,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.302,
"mask/has_final_conf_rate": 0.90625,
"mask/share_final_conf": 0.03270100802183151,
"mask/share_reasoning": 0.8736139535903931,
"mask/share_step_conf": 0.08196623623371124,
"num_tokens": 24007693.0,
"reward": 0.6413122415542603,
"reward_std": 0.29796165227890015,
"rewards/accuracy_reward_step": 0.38671875,
"rewards/final_brier_reward_step": 0.5986918210983276,
"rewards/format_reward_step": 0.8203125,
"rewards/step_l2_reward": 0.6839326620101929,
"step": 101
},
{
"adv/mean_abs_final_conf": 0.6865018606185913,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7594526410102844,
"adv/std_final_conf": 0.8888486623764038,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339803457260132,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.8220064724919094,
"calib/avg_num_step_conf": 3.98046875,
"calib/ece": 0.11754098360655743,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.42213114754098363,
"calib/gap": 0.40786614335881016,
"calib/mean_conf": 0.6522950819672132,
"calib/mu_c": 0.824468085106383,
"calib/mu_w": 0.41660194174757287,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.91015625,
"calib/pce": 0.09598360655737712,
"calib/std_conf": 0.35119651890906567,
"calib/step_conf_rate": 0.91015625,
"calib/step_q_c": 0.6135378323108384,
"calib/step_q_c_n": 489.0,
"calib/step_q_gap": 0.21772651155612133,
"calib/step_q_w": 0.39581132075471703,
"calib/step_q_w_n": 530.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2163.0,
"completions/max_terminated_length": 2163.0,
"completions/mean_length": 441.46484375,
"completions/mean_terminated_length": 441.46484375,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1088,
"grad_norm": 0.04502592608332634,
"kl": 0.119964599609375,
"learning_rate": 2.7222222222222224e-06,
"loss": -0.1303,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.04176633059978485,
"mask/share_reasoning": 0.8533238172531128,
"mask/share_step_conf": 0.10490988194942474,
"num_tokens": 24227404.0,
"reward": 0.7619272470474243,
"reward_std": 0.2033213973045349,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7460718750953674,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l2_reward": 0.7777825593948364,
"step": 102
},
{
"adv/mean_abs_final_conf": 0.7332376837730408,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7705097198486328,
"adv/std_final_conf": 0.8930462598800659,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9359943866729736,
"calib/answer_extract_rate": 0.91796875,
"calib/auroc": 0.804710793082886,
"calib/avg_num_step_conf": 3.50390625,
"calib/ece": 0.16360515021459227,
"calib/final_conf_rate": 0.91015625,
"calib/format_rate": 0.86328125,
"calib/frac_conf_gt_0.9": 0.463519313304721,
"calib/gap": 0.3344596004770423,
"calib/mean_conf": 0.672961373390558,
"calib/mu_c": 0.8222480620155038,
"calib/mu_w": 0.4877884615384615,
"calib/nonempty_final_conf_rate": 0.91015625,
"calib/nonempty_reasoning_rate": 0.94140625,
"calib/nonempty_step_conf_rate": 0.890625,
"calib/pce": 0.14145922746781114,
"calib/std_conf": 0.34641422928406723,
"calib/step_conf_rate": 0.890625,
"calib/step_q_c": 0.6104255319148936,
"calib/step_q_c_n": 423.0,
"calib/step_q_gap": 0.16222721967860665,
"calib/step_q_w": 0.4481983122362869,
"calib/step_q_w_n": 474.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 592.2734375,
"completions/mean_terminated_length": 592.2734375,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.027462903410196304,
"kl": 0.0913848876953125,
"learning_rate": 2.6944444444444444e-06,
"loss": -0.3395,
"mask/has_final_conf_rate": 0.91015625,
"mask/share_final_conf": 0.03436726704239845,
"mask/share_reasoning": 0.8866649866104126,
"mask/share_step_conf": 0.07896770536899567,
"num_tokens": 24483578.0,
"reward": 0.7046219110488892,
"reward_std": 0.28575944900512695,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6755644679069519,
"rewards/format_reward_step": 0.86328125,
"rewards/step_l2_reward": 0.7336792349815369,
"step": 103
},
{
"adv/mean_abs_final_conf": 0.7110572457313538,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7327368855476379,
"adv/std_final_conf": 0.8901990056037903,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354986548423767,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8121157444533548,
"calib/avg_num_step_conf": 4.140625,
"calib/ece": 0.1080408163265306,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.3020408163265306,
"calib/gap": 0.36994854317027537,
"calib/mean_conf": 0.575469387755102,
"calib/mu_c": 0.7702586206896552,
"calib/mu_w": 0.40031007751937986,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.90234375,
"calib/pce": 0.1050204081632653,
"calib/std_conf": 0.3335833342654053,
"calib/step_conf_rate": 0.90234375,
"calib/step_q_c": 0.6183047210300429,
"calib/step_q_c_n": 466.0,
"calib/step_q_gap": 0.17468519241051428,
"calib/step_q_w": 0.4436195286195286,
"calib/step_q_w_n": 594.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2955.0,
"completions/max_terminated_length": 2955.0,
"completions/mean_length": 507.60546875,
"completions/mean_terminated_length": 507.60546875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.03985770791769028,
"kl": 0.111175537109375,
"learning_rate": 2.666666666666667e-06,
"loss": -0.2427,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03428574651479721,
"mask/share_reasoning": 0.8698962926864624,
"mask/share_step_conf": 0.09581795334815979,
"num_tokens": 24720205.0,
"reward": 0.7403326630592346,
"reward_std": 0.22890345752239227,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.7359715104103088,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l2_reward": 0.7446938753128052,
"step": 104
},
{
"adv/mean_abs_final_conf": 0.7528746128082275,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.761589765548706,
"adv/std_final_conf": 0.9238329529762268,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935430109500885,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.766133442712606,
"calib/avg_num_step_conf": 4.58203125,
"calib/ece": 0.16840163934426228,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.430327868852459,
"calib/gap": 0.3228370248837846,
"calib/mean_conf": 0.6450409836065574,
"calib/mu_c": 0.7852898550724638,
"calib/mu_w": 0.4624528301886793,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.12393442622950818,
"calib/std_conf": 0.35783329980801337,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.5743478260869566,
"calib/step_q_c_n": 529.0,
"calib/step_q_gap": 0.16812111801242235,
"calib/step_q_w": 0.4062267080745342,
"calib/step_q_w_n": 644.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2921.0,
"completions/max_terminated_length": 2921.0,
"completions/mean_length": 493.16015625,
"completions/mean_terminated_length": 495.0941467285156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.112,
"grad_norm": 0.05232438072562218,
"kl": 0.10643768310546875,
"learning_rate": 2.6388888888888893e-06,
"loss": -0.1443,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03616030886769295,
"mask/share_reasoning": 0.8623545169830322,
"mask/share_step_conf": 0.09757894277572632,
"num_tokens": 24952214.0,
"reward": 0.7511138319969177,
"reward_std": 0.25214970111846924,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7188429832458496,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7833847403526306,
"step": 105
},
{
"adv/mean_abs_final_conf": 0.6756178140640259,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7654324769973755,
"adv/std_final_conf": 0.8626399636268616,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351634383201599,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7351481916699308,
"calib/avg_num_step_conf": 4.26953125,
"calib/ece": 0.19839357429718887,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5261044176706827,
"calib/gap": 0.268264786525656,
"calib/mean_conf": 0.7293975903614458,
"calib/mu_c": 0.8489855072463768,
"calib/mu_w": 0.5807207207207208,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.18678714859437762,
"calib/std_conf": 0.3225239920626697,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.6267474048442906,
"calib/step_q_c_n": 578.0,
"calib/step_q_gap": 0.13886390969865953,
"calib/step_q_w": 0.48788349514563106,
"calib/step_q_w_n": 515.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2553.0,
"completions/max_terminated_length": 2553.0,
"completions/mean_length": 474.6640625,
"completions/mean_terminated_length": 474.6640625,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.031673479825258255,
"kl": 0.10186004638671875,
"learning_rate": 2.6111111111111113e-06,
"loss": -0.1221,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03562416881322861,
"mask/share_reasoning": 0.8671010732650757,
"mask/share_step_conf": 0.09727469086647034,
"num_tokens": 25178312.0,
"reward": 0.7384791970252991,
"reward_std": 0.22508171200752258,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7021632790565491,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7747950553894043,
"step": 106
},
{
"adv/mean_abs_final_conf": 0.7364275455474854,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7385661005973816,
"adv/std_final_conf": 0.8878201246261597,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356435537338257,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6748719699556163,
"calib/avg_num_step_conf": 4.2265625,
"calib/ece": 0.24524390243902444,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.90625,
"calib/frac_conf_gt_0.9": 0.6260162601626016,
"calib/gap": 0.18848685558210987,
"calib/mean_conf": 0.8042682926829267,
"calib/mu_c": 0.8816551724137931,
"calib/mu_w": 0.6931683168316832,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.921875,
"calib/pce": 0.23004065040650412,
"calib/std_conf": 0.28188548021137166,
"calib/step_conf_rate": 0.921875,
"calib/step_q_c": 0.6186688311688311,
"calib/step_q_c_n": 616.0,
"calib/step_q_gap": 0.10918385262805852,
"calib/step_q_w": 0.5094849785407726,
"calib/step_q_w_n": 466.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2361.0,
"completions/max_terminated_length": 2361.0,
"completions/mean_length": 475.24609375,
"completions/mean_terminated_length": 475.24609375,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.05143415182828903,
"kl": 0.10625457763671875,
"learning_rate": 2.5833333333333337e-06,
"loss": -0.2211,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03504911810159683,
"mask/share_reasoning": 0.862432599067688,
"mask/share_step_conf": 0.102518230676651,
"num_tokens": 25404591.0,
"reward": 0.7037363052368164,
"reward_std": 0.2831021547317505,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.6533273458480835,
"rewards/format_reward_step": 0.90625,
"rewards/step_l2_reward": 0.7541453838348389,
"step": 107
},
{
"adv/mean_abs_final_conf": 0.6004273295402527,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7567219734191895,
"adv/std_final_conf": 0.8114179372787476,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343503713607788,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7457677238116361,
"calib/avg_num_step_conf": 4.83984375,
"calib/ece": 0.18149193548387108,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.7016129032258065,
"calib/gap": 0.27782730834627056,
"calib/mean_conf": 0.8316532258064516,
"calib/mu_c": 0.9223952095808384,
"calib/mu_w": 0.6445679012345679,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.1698790322580646,
"calib/std_conf": 0.27590337089321565,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.6351222651222651,
"calib/step_q_c_n": 777.0,
"calib/step_q_gap": 0.14395343395343396,
"calib/step_q_w": 0.49116883116883114,
"calib/step_q_w_n": 462.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2503.0,
"completions/max_terminated_length": 2503.0,
"completions/mean_length": 506.25390625,
"completions/mean_terminated_length": 508.2392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.1152,
"grad_norm": 0.029269879683852196,
"kl": 0.087646484375,
"learning_rate": 2.5555555555555557e-06,
"loss": -0.0016,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.034039564430713654,
"mask/share_reasoning": 0.8554302453994751,
"mask/share_step_conf": 0.10662389546632767,
"num_tokens": 25637424.0,
"reward": 0.7967321872711182,
"reward_std": 0.17280207574367523,
"rewards/accuracy_reward_step": 0.65625,
"rewards/final_brier_reward_step": 0.7668719291687012,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8265925049781799,
"step": 108
},
{
"adv/mean_abs_final_conf": 0.6863927841186523,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7505053281784058,
"adv/std_final_conf": 0.8743253350257874,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346165060997009,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.8137633970967304,
"calib/avg_num_step_conf": 5.29296875,
"calib/ece": 0.22839506172839505,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.5308641975308642,
"calib/gap": 0.38043345543345564,
"calib/mean_conf": 0.6843621399176956,
"calib/mu_c": 0.8816239316239318,
"calib/mu_w": 0.5011904761904762,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2156378600823045,
"calib/std_conf": 0.3631298338915262,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.6091071428571428,
"calib/step_q_c_n": 560.0,
"calib/step_q_gap": 0.22157255166217432,
"calib/step_q_w": 0.3875345911949685,
"calib/step_q_w_n": 795.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2714.0,
"completions/max_terminated_length": 2714.0,
"completions/mean_length": 557.12890625,
"completions/mean_terminated_length": 557.12890625,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.027278034016489983,
"kl": 0.09093475341796875,
"learning_rate": 2.5277777777777778e-06,
"loss": -0.1032,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.032528433948755264,
"mask/share_reasoning": 0.8628048896789551,
"mask/share_step_conf": 0.10466665029525757,
"num_tokens": 25884649.0,
"reward": 0.7528167963027954,
"reward_std": 0.2184036374092102,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.7128750085830688,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7927586436271667,
"step": 109
},
{
"adv/mean_abs_final_conf": 0.7272202968597412,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7695968151092529,
"adv/std_final_conf": 0.9092324376106262,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345746040344238,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6763870967741936,
"calib/avg_num_step_conf": 4.5390625,
"calib/ece": 0.26220883534136546,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5622489959839357,
"calib/gap": 0.2412077419354839,
"calib/mean_conf": 0.740120481927711,
"calib/mu_c": 0.86024,
"calib/mu_w": 0.6190322580645161,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.25016064257028114,
"calib/std_conf": 0.3194931753552225,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5981086142322097,
"calib/step_q_c_n": 534.0,
"calib/step_q_gap": 0.12742390085641359,
"calib/step_q_w": 0.47068471337579615,
"calib/step_q_w_n": 628.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1336.0,
"completions/max_terminated_length": 1336.0,
"completions/mean_length": 449.93359375,
"completions/mean_terminated_length": 451.69805908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.05075117200613022,
"kl": 0.104644775390625,
"learning_rate": 2.5e-06,
"loss": -0.1052,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.036121100187301636,
"mask/share_reasoning": 0.8499696850776672,
"mask/share_step_conf": 0.11000296473503113,
"num_tokens": 26104752.0,
"reward": 0.7489697933197021,
"reward_std": 0.19645348191261292,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6923683285713196,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8055711984634399,
"step": 110
},
{
"adv/mean_abs_final_conf": 0.6510974168777466,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7645963430404663,
"adv/std_final_conf": 0.861767590045929,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355725646018982,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7273393761663556,
"calib/avg_num_step_conf": 4.20703125,
"calib/ece": 0.2798367346938776,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.6204081632653061,
"calib/gap": 0.2111796854172222,
"calib/mean_conf": 0.778122448979592,
"calib/mu_c": 0.8824193548387098,
"calib/mu_w": 0.6712396694214876,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.27591836734693886,
"calib/std_conf": 0.31335144464864084,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5854054054054054,
"calib/step_q_c_n": 518.0,
"calib/step_q_gap": 0.11105835710486883,
"calib/step_q_w": 0.4743470483005366,
"calib/step_q_w_n": 559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2710.0,
"completions/max_terminated_length": 2710.0,
"completions/mean_length": 505.3671875,
"completions/mean_terminated_length": 505.3671875,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.1184,
"grad_norm": 0.029809826985001564,
"kl": 0.09429168701171875,
"learning_rate": 2.4722222222222226e-06,
"loss": -0.2014,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03598381206393242,
"mask/share_reasoning": 0.8623403906822205,
"mask/share_step_conf": 0.10167580842971802,
"num_tokens": 26341534.0,
"reward": 0.7034972310066223,
"reward_std": 0.23738251626491547,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6384176015853882,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.7685769200325012,
"step": 111
},
{
"adv/mean_abs_final_conf": 0.6813622713088989,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7548239231109619,
"adv/std_final_conf": 0.8669583797454834,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353672862052917,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.8494515108688336,
"calib/avg_num_step_conf": 4.671875,
"calib/ece": 0.16639344262295083,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.5163934426229508,
"calib/gap": 0.4569513426206338,
"calib/mean_conf": 0.6565573770491803,
"calib/mu_c": 0.8756692913385826,
"calib/mu_w": 0.4187179487179488,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.15122950819672132,
"calib/std_conf": 0.3894379524892531,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5153577371048254,
"calib/step_q_c_n": 601.0,
"calib/step_q_gap": 0.16880311525608582,
"calib/step_q_w": 0.34655462184873953,
"calib/step_q_w_n": 595.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2866.0,
"completions/max_terminated_length": 2866.0,
"completions/mean_length": 557.4453125,
"completions/mean_terminated_length": 559.6314086914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.4219500720500946,
"kl": 0.161285400390625,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0028,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.031249389052391052,
"mask/share_reasoning": 0.868740439414978,
"mask/share_step_conf": 0.09610390663146973,
"num_tokens": 26592160.0,
"reward": 0.778309166431427,
"reward_std": 0.21401192247867584,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7561488151550293,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8004695773124695,
"step": 112
},
{
"adv/mean_abs_final_conf": 0.731112003326416,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7389805316925049,
"adv/std_final_conf": 0.8750357627868652,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351128935813904,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7472007722007723,
"calib/avg_num_step_conf": 5.1796875,
"calib/ece": 0.23422310756972098,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6055776892430279,
"calib/gap": 0.2579607464607463,
"calib/mean_conf": 0.7602788844621514,
"calib/mu_c": 0.8743571428571428,
"calib/mu_w": 0.6163963963963965,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 1.0,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2183665338645417,
"calib/std_conf": 0.3249215687040707,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4958333333333333,
"calib/step_q_c_n": 684.0,
"calib/step_q_gap": 0.1295093457943925,
"calib/step_q_w": 0.3663239875389408,
"calib/step_q_w_n": 642.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 481.0234375,
"completions/mean_terminated_length": 481.0234375,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.02653811313211918,
"kl": 0.1146087646484375,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0389,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.0368918776512146,
"mask/share_reasoning": 0.8495938777923584,
"mask/share_step_conf": 0.11351422965526581,
"num_tokens": 26820502.0,
"reward": 0.774255633354187,
"reward_std": 0.21949368715286255,
"rewards/accuracy_reward_step": 0.546875,
"rewards/final_brier_reward_step": 0.7130206823348999,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8354905843734741,
"step": 113
},
{
"adv/mean_abs_final_conf": 0.6168874502182007,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7286229133605957,
"adv/std_final_conf": 0.8332869410514832,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345226287841797,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7605859923740718,
"calib/avg_num_step_conf": 5.49609375,
"calib/ece": 0.2521200000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.696,
"calib/gap": 0.2965569603317947,
"calib/mean_conf": 0.82508,
"calib/mu_c": 0.9425165562913906,
"calib/mu_w": 0.645959595959596,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.2366000000000001,
"calib/std_conf": 0.2983182756721418,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5084565499351491,
"calib/step_q_c_n": 771.0,
"calib/step_q_gap": 0.14667982037540062,
"calib/step_q_w": 0.3617767295597485,
"calib/step_q_w_n": 636.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2179.0,
"completions/max_terminated_length": 2179.0,
"completions/mean_length": 468.2265625,
"completions/mean_terminated_length": 473.7786865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.1216,
"grad_norm": 0.034657854586839676,
"kl": 0.11187744140625,
"learning_rate": 2.388888888888889e-06,
"loss": -0.0766,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03760165348649025,
"mask/share_reasoning": 0.8239021301269531,
"mask/share_step_conf": 0.12677747011184692,
"num_tokens": 27045392.0,
"reward": 0.7951518297195435,
"reward_std": 0.19239452481269836,
"rewards/accuracy_reward_step": 0.58984375,
"rewards/final_brier_reward_step": 0.7430340051651001,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8472696542739868,
"step": 114
},
{
"adv/mean_abs_final_conf": 0.738232433795929,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7586345672607422,
"adv/std_final_conf": 0.8943834900856018,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354433417320251,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.6038682864450128,
"calib/avg_num_step_conf": 5.109375,
"calib/ece": 0.3742231075697211,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.6573705179282868,
"calib/gap": 0.08126214833759571,
"calib/mean_conf": 0.8058565737051795,
"calib/mu_c": 0.8430882352941176,
"calib/mu_w": 0.7618260869565219,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3191235059760956,
"calib/std_conf": 0.3182127056584067,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.440656634746922,
"calib/step_q_c_n": 731.0,
"calib/step_q_gap": 0.028663567155934155,
"calib/step_q_w": 0.41199306759098786,
"calib/step_q_w_n": 577.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2734.0,
"completions/max_terminated_length": 2734.0,
"completions/mean_length": 457.87109375,
"completions/mean_terminated_length": 459.66668701171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.035833049565553665,
"kl": 0.11376953125,
"learning_rate": 2.361111111111111e-06,
"loss": -0.0914,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03642076253890991,
"mask/share_reasoning": 0.8400872349739075,
"mask/share_step_conf": 0.11958577483892441,
"num_tokens": 27267871.0,
"reward": 0.6914719343185425,
"reward_std": 0.21936823427677155,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.5942429304122925,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7887008190155029,
"step": 115
},
{
"adv/mean_abs_final_conf": 0.7463953495025635,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7141386866569519,
"adv/std_final_conf": 0.9021322131156921,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350417256355286,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7284679878048781,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.28270916334661356,
"calib/final_conf_rate": 0.98046875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5816733067729084,
"calib/gap": 0.2593025914634147,
"calib/mean_conf": 0.7619123505976096,
"calib/mu_c": 0.8941463414634148,
"calib/mu_w": 0.63484375,
"calib/nonempty_final_conf_rate": 0.98046875,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.2772908366533865,
"calib/std_conf": 0.32402278504134363,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.43413078149920264,
"calib/step_q_c_n": 627.0,
"calib/step_q_gap": 0.07578913561391587,
"calib/step_q_w": 0.3583416458852868,
"calib/step_q_w_n": 802.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1816.0,
"completions/max_terminated_length": 1816.0,
"completions/mean_length": 516.63671875,
"completions/mean_terminated_length": 518.6627807617188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.03071327693760395,
"kl": 0.10791015625,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.1317,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.032803844660520554,
"mask/share_reasoning": 0.8485397100448608,
"mask/share_step_conf": 0.11475016176700592,
"num_tokens": 27504650.0,
"reward": 0.7508207559585571,
"reward_std": 0.2131670117378235,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.676442563533783,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8251989483833313,
"step": 116
},
{
"adv/mean_abs_final_conf": 0.7544440031051636,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7447916269302368,
"adv/std_final_conf": 0.9069930911064148,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348868131637573,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7259875541125542,
"calib/avg_num_step_conf": 4.9453125,
"calib/ece": 0.38098360655737695,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5368852459016393,
"calib/gap": 0.1711742424242425,
"calib/mean_conf": 0.7961475409836066,
"calib/mu_c": 0.88875,
"calib/mu_w": 0.7175757575757575,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.3590573770491802,
"calib/std_conf": 0.2844391857019804,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.44404444444444446,
"calib/step_q_c_n": 495.0,
"calib/step_q_gap": 0.08029606571552095,
"calib/step_q_w": 0.3637483787289235,
"calib/step_q_w_n": 771.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2486.0,
"completions/max_terminated_length": 2486.0,
"completions/mean_length": 492.6796875,
"completions/mean_terminated_length": 494.6117858886719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.1248,
"grad_norm": 0.03247150406241417,
"kl": 0.11163330078125,
"learning_rate": 2.305555555555556e-06,
"loss": -0.1601,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03434731066226959,
"mask/share_reasoning": 0.847392737865448,
"mask/share_step_conf": 0.11435367912054062,
"num_tokens": 27737376.0,
"reward": 0.705039381980896,
"reward_std": 0.21480512619018555,
"rewards/accuracy_reward_step": 0.4375,
"rewards/final_brier_reward_step": 0.607038676738739,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8030400276184082,
"step": 117
},
{
"adv/mean_abs_final_conf": 0.7744559049606323,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7268549799919128,
"adv/std_final_conf": 0.9298220872879028,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353885650634766,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6918957284810944,
"calib/avg_num_step_conf": 5.73828125,
"calib/ece": 0.3121602409638554,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5622489959839357,
"calib/gap": 0.18730857530003875,
"calib/mean_conf": 0.7585224899598393,
"calib/mu_c": 0.8510484126984126,
"calib/mu_w": 0.6637398373983738,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.2823293172690763,
"calib/std_conf": 0.3323967390102428,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4162891207153503,
"calib/step_q_c_n": 671.0,
"calib/step_q_gap": 0.07127658938702952,
"calib/step_q_w": 0.34501253132832077,
"calib/step_q_w_n": 798.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2267.0,
"completions/max_terminated_length": 2267.0,
"completions/mean_length": 512.6796875,
"completions/mean_terminated_length": 518.7589111328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.022973336279392242,
"kl": 0.111297607421875,
"learning_rate": 2.277777777777778e-06,
"loss": -0.1643,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03558062016963959,
"mask/share_reasoning": 0.8346555829048157,
"mask/share_step_conf": 0.11804504692554474,
"num_tokens": 27972630.0,
"reward": 0.7236074209213257,
"reward_std": 0.2213224172592163,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6395763754844666,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8076384663581848,
"step": 118
},
{
"adv/mean_abs_final_conf": 0.7573567628860474,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7542221546173096,
"adv/std_final_conf": 0.9039705395698547,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353067874908447,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7770766559829061,
"calib/avg_num_step_conf": 4.984375,
"calib/ece": 0.27346122448979593,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.5306122448979592,
"calib/gap": 0.32107946047008573,
"calib/mean_conf": 0.6977224489795919,
"calib/mu_c": 0.8654700854700856,
"calib/mu_w": 0.5443906249999999,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.2468163265306123,
"calib/std_conf": 0.3755614742284074,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.43715547703180213,
"calib/step_q_c_n": 566.0,
"calib/step_q_gap": 0.11232449111630916,
"calib/step_q_w": 0.324830985915493,
"calib/step_q_w_n": 710.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2908.0,
"completions/max_terminated_length": 2908.0,
"completions/mean_length": 540.078125,
"completions/mean_terminated_length": 542.1961059570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.020019669085741043,
"kl": 0.11639404296875,
"learning_rate": 2.25e-06,
"loss": -0.0749,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03293025493621826,
"mask/share_reasoning": 0.8575558662414551,
"mask/share_step_conf": 0.10560765862464905,
"num_tokens": 28215954.0,
"reward": 0.7540497183799744,
"reward_std": 0.22805644571781158,
"rewards/accuracy_reward_step": 0.45703125,
"rewards/final_brier_reward_step": 0.6844784021377563,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8236210346221924,
"step": 119
},
{
"adv/mean_abs_final_conf": 0.7578812837600708,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7718867063522339,
"adv/std_final_conf": 0.9244391322135925,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350153207778931,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7429498122226015,
"calib/avg_num_step_conf": 5.09375,
"calib/ece": 0.20085365853658543,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.508130081300813,
"calib/gap": 0.3276824854899283,
"calib/mean_conf": 0.6976016260162602,
"calib/mu_c": 0.8321379310344827,
"calib/mu_w": 0.5044554455445545,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1545121951219513,
"calib/std_conf": 0.36518780645182547,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.41482662968099865,
"calib/step_q_c_n": 721.0,
"calib/step_q_gap": 0.11378031750261097,
"calib/step_q_w": 0.3010463121783877,
"calib/step_q_w_n": 583.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2719.0,
"completions/max_terminated_length": 2719.0,
"completions/mean_length": 495.9375,
"completions/mean_terminated_length": 495.9375,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.128,
"grad_norm": 0.021387485787272453,
"kl": 0.1209259033203125,
"learning_rate": 2.222222222222222e-06,
"loss": -0.0694,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.0344981849193573,
"mask/share_reasoning": 0.8499711751937866,
"mask/share_step_conf": 0.11553068459033966,
"num_tokens": 28449602.0,
"reward": 0.7760567665100098,
"reward_std": 0.21278806030750275,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7316077947616577,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.820505678653717,
"step": 120
},
{
"adv/mean_abs_final_conf": 0.7948448657989502,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.77760249376297,
"adv/std_final_conf": 0.9363829493522644,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354314804077148,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7047554347826087,
"calib/avg_num_step_conf": 5.75390625,
"calib/ece": 0.2051028806584362,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.3991769547325103,
"calib/gap": 0.256281929347826,
"calib/mean_conf": 0.671604938271605,
"calib/mu_c": 0.792890625,
"calib/mu_w": 0.536608695652174,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.17497942386831278,
"calib/std_conf": 0.34387571969084624,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.3972486686390532,
"calib/step_q_c_n": 676.0,
"calib/step_q_gap": 0.06644440264156265,
"calib/step_q_w": 0.3308042659974906,
"calib/step_q_w_n": 797.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2666.0,
"completions/max_terminated_length": 2666.0,
"completions/mean_length": 556.609375,
"completions/mean_terminated_length": 563.2095336914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.02432946488261223,
"kl": 0.1153717041015625,
"learning_rate": 2.1944444444444445e-06,
"loss": -0.0982,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.030991578474640846,
"mask/share_reasoning": 0.848613440990448,
"mask/share_step_conf": 0.1086762472987175,
"num_tokens": 28697150.0,
"reward": 0.742145836353302,
"reward_std": 0.22360759973526,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6939094066619873,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.7903823852539062,
"step": 121
},
{
"adv/mean_abs_final_conf": 0.7506898641586304,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7413808107376099,
"adv/std_final_conf": 0.9135042428970337,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934847354888916,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7421900799154613,
"calib/avg_num_step_conf": 5.71875,
"calib/ece": 0.1756000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.456,
"calib/gap": 0.3090033683376263,
"calib/mean_conf": 0.68616,
"calib/mu_c": 0.8134693877551021,
"calib/mu_w": 0.5044660194174758,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1368800000000001,
"calib/std_conf": 0.35813245370951796,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4297733333333334,
"calib/step_q_c_n": 750.0,
"calib/step_q_gap": 0.1313027450980393,
"calib/step_q_w": 0.2984705882352941,
"calib/step_q_w_n": 714.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2893.0,
"completions/max_terminated_length": 2893.0,
"completions/mean_length": 505.79296875,
"completions/mean_terminated_length": 507.7764892578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.019587241113185883,
"kl": 0.1252899169921875,
"learning_rate": 2.166666666666667e-06,
"loss": -0.0778,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03283946216106415,
"mask/share_reasoning": 0.8448941707611084,
"mask/share_step_conf": 0.11836010217666626,
"num_tokens": 28933977.0,
"reward": 0.785601019859314,
"reward_std": 0.19185268878936768,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7401956915855408,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8310062885284424,
"step": 122
},
{
"adv/mean_abs_final_conf": 0.7581375241279602,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.749475359916687,
"adv/std_final_conf": 0.9185908436775208,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353261590003967,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.687900826446281,
"calib/avg_num_step_conf": 5.7109375,
"calib/ece": 0.21955284552845528,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.3252032520325203,
"calib/gap": 0.21208528925619852,
"calib/mean_conf": 0.6276016260162601,
"calib/mu_c": 0.7319200000000001,
"calib/mu_w": 0.5198347107438016,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.16951219512195123,
"calib/std_conf": 0.34155124934966263,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.367410071942446,
"calib/step_q_c_n": 695.0,
"calib/step_q_gap": 0.0253109845891214,
"calib/step_q_w": 0.3420990873533246,
"calib/step_q_w_n": 767.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2731.0,
"completions/max_terminated_length": 2731.0,
"completions/mean_length": 570.0,
"completions/mean_terminated_length": 574.4881591796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.1312,
"grad_norm": 0.022155309095978737,
"kl": 0.11624908447265625,
"learning_rate": 2.138888888888889e-06,
"loss": -0.078,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.029910586774349213,
"mask/share_reasoning": 0.853052020072937,
"mask/share_step_conf": 0.10922486335039139,
"num_tokens": 29185185.0,
"reward": 0.7430111765861511,
"reward_std": 0.1918821930885315,
"rewards/accuracy_reward_step": 0.4921875,
"rewards/final_brier_reward_step": 0.6940113306045532,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.792011022567749,
"step": 123
},
{
"adv/mean_abs_final_conf": 0.7517533302307129,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7596127986907959,
"adv/std_final_conf": 0.9228248596191406,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353944659233093,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7227831715210356,
"calib/avg_num_step_conf": 5.00390625,
"calib/ece": 0.16616877470355734,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.4031620553359684,
"calib/gap": 0.27088122977346263,
"calib/mean_conf": 0.6841869565217391,
"calib/mu_c": 0.7944666666666665,
"calib/mu_w": 0.5235854368932039,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.12873517786561267,
"calib/std_conf": 0.33946601899337664,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.42114845938375356,
"calib/step_q_c_n": 714.0,
"calib/step_q_gap": 0.07587509077705162,
"calib/step_q_w": 0.34527336860670194,
"calib/step_q_w_n": 567.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2269.0,
"completions/max_terminated_length": 2269.0,
"completions/mean_length": 493.35546875,
"completions/mean_terminated_length": 493.35546875,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.03565501794219017,
"kl": 0.12664794921875,
"learning_rate": 2.1111111111111114e-06,
"loss": -0.0886,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.03346759080886841,
"mask/share_reasoning": 0.8561069369316101,
"mask/share_step_conf": 0.11042545735836029,
"num_tokens": 29418300.0,
"reward": 0.7763141989707947,
"reward_std": 0.18388475477695465,
"rewards/accuracy_reward_step": 0.5859375,
"rewards/final_brier_reward_step": 0.7414895296096802,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8111388683319092,
"step": 124
},
{
"adv/mean_abs_final_conf": 0.7748832106590271,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7449538707733154,
"adv/std_final_conf": 0.9294499158859253,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354871511459351,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.6227737905369484,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.2821544715447155,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.3617886178861789,
"calib/gap": 0.1516347687400318,
"calib/mean_conf": 0.6411788617886177,
"calib/mu_c": 0.7225438596491228,
"calib/mu_w": 0.570909090909091,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.229959349593496,
"calib/std_conf": 0.35106549854496266,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4089411764705882,
"calib/step_q_c_n": 595.0,
"calib/step_q_gap": 0.04208439573548056,
"calib/step_q_w": 0.36685678073510763,
"calib/step_q_w_n": 789.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2594.0,
"completions/max_terminated_length": 2594.0,
"completions/mean_length": 528.625,
"completions/mean_terminated_length": 530.6980590820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.024459734559059143,
"kl": 0.11830902099609375,
"learning_rate": 2.0833333333333334e-06,
"loss": -0.0704,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03457402065396309,
"mask/share_reasoning": 0.8480210304260254,
"mask/share_step_conf": 0.11349868029356003,
"num_tokens": 29658436.0,
"reward": 0.7104007005691528,
"reward_std": 0.21995657682418823,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.6415195465087891,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.7792819142341614,
"step": 125
},
{
"adv/mean_abs_final_conf": 0.744687557220459,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7507373094558716,
"adv/std_final_conf": 0.9142650961875916,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354282021522522,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7776556044248968,
"calib/avg_num_step_conf": 5.7734375,
"calib/ece": 0.23142857142857154,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4326530612244898,
"calib/gap": 0.34006064240970274,
"calib/mean_conf": 0.636734693877551,
"calib/mu_c": 0.8074590163934426,
"calib/mu_w": 0.4673983739837399,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.18510204081632667,
"calib/std_conf": 0.38000530477018046,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.4600654664484452,
"calib/step_q_c_n": 611.0,
"calib/step_q_gap": 0.1603538170828166,
"calib/step_q_w": 0.2997116493656286,
"calib/step_q_w_n": 867.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3069.0,
"completions/max_terminated_length": 3069.0,
"completions/mean_length": 544.24609375,
"completions/mean_terminated_length": 544.24609375,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.1344,
"grad_norm": 0.024065330624580383,
"kl": 0.1114044189453125,
"learning_rate": 2.0555555555555555e-06,
"loss": -0.0088,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03621228411793709,
"mask/share_reasoning": 0.8417133092880249,
"mask/share_step_conf": 0.12207438796758652,
"num_tokens": 29903227.0,
"reward": 0.7633358836174011,
"reward_std": 0.1952163577079773,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7203140258789062,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.806357741355896,
"step": 126
},
{
"adv/mean_abs_final_conf": 0.7594637274742126,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7706195116043091,
"adv/std_final_conf": 0.9249746203422546,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354552030563354,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7582298786441624,
"calib/avg_num_step_conf": 6.2578125,
"calib/ece": 0.20667634854771783,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.31950207468879666,
"calib/gap": 0.3310576091505092,
"calib/mean_conf": 0.5978049792531119,
"calib/mu_c": 0.7818785046728972,
"calib/mu_w": 0.450820895522388,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.18024896265560164,
"calib/std_conf": 0.36942244838314564,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.45648788927335643,
"calib/step_q_c_n": 578.0,
"calib/step_q_gap": 0.16179745958585645,
"calib/step_q_w": 0.2946904296875,
"calib/step_q_w_n": 1024.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2815.0,
"completions/max_terminated_length": 2815.0,
"completions/mean_length": 562.96484375,
"completions/mean_terminated_length": 565.172607421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.022668741643428802,
"kl": 0.118194580078125,
"learning_rate": 2.027777777777778e-06,
"loss": -0.0401,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.03429264575242996,
"mask/share_reasoning": 0.8373519778251648,
"mask/share_step_conf": 0.12444911152124405,
"num_tokens": 30151018.0,
"reward": 0.7562384009361267,
"reward_std": 0.21813994646072388,
"rewards/accuracy_reward_step": 0.41796875,
"rewards/final_brier_reward_step": 0.7046475410461426,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.8078293204307556,
"step": 127
},
{
"adv/mean_abs_final_conf": 0.7670286893844604,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7674146294593811,
"adv/std_final_conf": 0.9218215942382812,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352871775627136,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.7272982456140351,
"calib/avg_num_step_conf": 4.7734375,
"calib/ece": 0.2316849372384937,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.35564853556485354,
"calib/gap": 0.2875707017543861,
"calib/mean_conf": 0.6113276150627615,
"calib/mu_c": 0.7617307017543861,
"calib/mu_w": 0.47415999999999997,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.18301255230125524,
"calib/std_conf": 0.3756346130230901,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.4362128731343283,
"calib/step_q_c_n": 536.0,
"calib/step_q_gap": 0.10781972444628168,
"calib/step_q_w": 0.32839314868804664,
"calib/step_q_w_n": 686.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2969.0,
"completions/max_terminated_length": 2969.0,
"completions/mean_length": 537.8359375,
"completions/mean_terminated_length": 539.9451293945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.02352401241660118,
"kl": 0.118560791015625,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.114,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.035591743886470795,
"mask/share_reasoning": 0.8582073450088501,
"mask/share_step_conf": 0.10229466110467911,
"num_tokens": 30395368.0,
"reward": 0.7304539680480957,
"reward_std": 0.22143486142158508,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.681101381778717,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.7798066139221191,
"step": 128
},
{
"adv/mean_abs_final_conf": 0.7487454414367676,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7624181509017944,
"adv/std_final_conf": 0.9342681169509888,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351850748062134,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.6731820493262142,
"calib/avg_num_step_conf": 5.31640625,
"calib/ece": 0.23396785714285712,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.39285714285714285,
"calib/gap": 0.2077184210526316,
"calib/mean_conf": 0.6626988095238094,
"calib/mu_c": 0.7566666666666667,
"calib/mu_w": 0.5489482456140351,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.17452380952380953,
"calib/std_conf": 0.3514597368484289,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.43732620320855614,
"calib/step_q_c_n": 748.0,
"calib/step_q_gap": 0.055645942197136866,
"calib/step_q_w": 0.3816802610114193,
"calib/step_q_w_n": 613.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1535.0,
"completions/max_terminated_length": 1535.0,
"completions/mean_length": 462.25,
"completions/mean_terminated_length": 464.0627746582031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.1376,
"grad_norm": 0.0282196793705225,
"kl": 0.1335296630859375,
"learning_rate": 1.9722222222222224e-06,
"loss": -0.1281,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.03705020248889923,
"mask/share_reasoning": 0.8317481875419617,
"mask/share_step_conf": 0.1272953748703003,
"num_tokens": 30616088.0,
"reward": 0.7445838451385498,
"reward_std": 0.18311186134815216,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.6939468383789062,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7952207326889038,
"step": 129
},
{
"adv/mean_abs_final_conf": 0.7417335510253906,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7767922282218933,
"adv/std_final_conf": 0.8977218866348267,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350262880325317,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7888799355358582,
"calib/avg_num_step_conf": 4.875,
"calib/ece": 0.17321236559139783,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4717741935483871,
"calib/gap": 0.33700913242009123,
"calib/mean_conf": 0.6984005376344086,
"calib/mu_c": 0.8370091324200913,
"calib/mu_w": 0.5000000000000001,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.1414516129032258,
"calib/std_conf": 0.3465609264910288,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.47448717948717956,
"calib/step_q_c_n": 702.0,
"calib/step_q_gap": 0.12732600732600735,
"calib/step_q_w": 0.3471611721611722,
"calib/step_q_w_n": 546.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2686.0,
"completions/max_terminated_length": 2686.0,
"completions/mean_length": 461.69140625,
"completions/mean_terminated_length": 461.69140625,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.02844446338713169,
"kl": 0.129486083984375,
"learning_rate": 1.944444444444445e-06,
"loss": -0.0903,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03700219839811325,
"mask/share_reasoning": 0.8433710336685181,
"mask/share_step_conf": 0.11962681263685226,
"num_tokens": 30839569.0,
"reward": 0.7797413468360901,
"reward_std": 0.1676107943058014,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7607482671737671,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.7987344264984131,
"step": 130
},
{
"adv/mean_abs_final_conf": 0.7331950664520264,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.751865029335022,
"adv/std_final_conf": 0.9233747124671936,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350845813751221,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7598986486486486,
"calib/avg_num_step_conf": 5.07421875,
"calib/ece": 0.20532258064516135,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.2661290322580645,
"calib/gap": 0.32641621621621636,
"calib/mean_conf": 0.5329032258064516,
"calib/mu_c": 0.7277000000000001,
"calib/mu_w": 0.40128378378378377,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1675000000000001,
"calib/std_conf": 0.3696538795072432,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4218803418803419,
"calib/step_q_c_n": 468.0,
"calib/step_q_gap": 0.10694652719923481,
"calib/step_q_w": 0.3149338146811071,
"calib/step_q_w_n": 831.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2406.0,
"completions/max_terminated_length": 2406.0,
"completions/mean_length": 481.13671875,
"completions/mean_terminated_length": 483.0235595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.02161959744989872,
"kl": 0.125335693359375,
"learning_rate": 1.916666666666667e-06,
"loss": -0.0873,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.034892257302999496,
"mask/share_reasoning": 0.8474602699279785,
"mask/share_step_conf": 0.11374115198850632,
"num_tokens": 31068948.0,
"reward": 0.7788792848587036,
"reward_std": 0.17850662767887115,
"rewards/accuracy_reward_step": 0.390625,
"rewards/final_brier_reward_step": 0.7352492213249207,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8225093483924866,
"step": 131
},
{
"adv/mean_abs_final_conf": 0.7242047190666199,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7435998916625977,
"adv/std_final_conf": 0.8903018832206726,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353998303413391,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7301073021181717,
"calib/avg_num_step_conf": 5.31640625,
"calib/ece": 0.24270161290322584,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.4959677419354839,
"calib/gap": 0.24520345596432558,
"calib/mean_conf": 0.6760887096774193,
"calib/mu_c": 0.767051282051282,
"calib/mu_w": 0.5218478260869565,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.14487903225806456,
"calib/std_conf": 0.379004990010915,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.48695418848167543,
"calib/step_q_c_n": 764.0,
"calib/step_q_gap": 0.11089053689038564,
"calib/step_q_w": 0.3760636515912898,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2574.0,
"completions/max_terminated_length": 2574.0,
"completions/mean_length": 516.24609375,
"completions/mean_terminated_length": 518.2706298828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.1408,
"grad_norm": 0.030215511098504066,
"kl": 0.116729736328125,
"learning_rate": 1.888888888888889e-06,
"loss": -0.0388,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03585230931639671,
"mask/share_reasoning": 0.8398134708404541,
"mask/share_step_conf": 0.12042798101902008,
"num_tokens": 31306699.0,
"reward": 0.7527471780776978,
"reward_std": 0.21779859066009521,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.6969640254974365,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8085302114486694,
"step": 132
},
{
"adv/mean_abs_final_conf": 0.7842279672622681,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7547422647476196,
"adv/std_final_conf": 0.9241669774055481,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354990124702454,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7024958632101489,
"calib/avg_num_step_conf": 5.7734375,
"calib/ece": 0.2579268292682927,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.34552845528455284,
"calib/gap": 0.2425606729178158,
"calib/mean_conf": 0.5730487804878048,
"calib/mu_c": 0.7189795918367347,
"calib/mu_w": 0.47641891891891885,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.21630081300813012,
"calib/std_conf": 0.375989823285099,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.43455341506129597,
"calib/step_q_c_n": 571.0,
"calib/step_q_gap": 0.06357215817044698,
"calib/step_q_w": 0.370981256890849,
"calib/step_q_w_n": 907.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2612.0,
"completions/max_terminated_length": 2612.0,
"completions/mean_length": 562.07421875,
"completions/mean_terminated_length": 566.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.03317323327064514,
"kl": 0.11187744140625,
"learning_rate": 1.8611111111111113e-06,
"loss": -0.0778,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03122769482433796,
"mask/share_reasoning": 0.855074405670166,
"mask/share_step_conf": 0.10588540881872177,
"num_tokens": 31556934.0,
"reward": 0.7412779927253723,
"reward_std": 0.22184857726097107,
"rewards/accuracy_reward_step": 0.3828125,
"rewards/final_brier_reward_step": 0.6710578203201294,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8114981651306152,
"step": 133
},
{
"adv/mean_abs_final_conf": 0.7846685647964478,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7685303688049316,
"adv/std_final_conf": 0.9300345182418823,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356147646903992,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.6804602630074328,
"calib/avg_num_step_conf": 4.36328125,
"calib/ece": 0.2869747899159665,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.8984375,
"calib/frac_conf_gt_0.9": 0.38235294117647056,
"calib/gap": 0.22956975414522585,
"calib/mean_conf": 0.5912605042016807,
"calib/mu_c": 0.7185849056603775,
"calib/mu_w": 0.4890151515151516,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.92578125,
"calib/pce": 0.21642857142857153,
"calib/std_conf": 0.38636325186236553,
"calib/step_conf_rate": 0.92578125,
"calib/step_q_c": 0.4460961538461538,
"calib/step_q_c_n": 520.0,
"calib/step_q_gap": 0.08304757763174841,
"calib/step_q_w": 0.3630485762144054,
"calib/step_q_w_n": 597.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2980.0,
"completions/max_terminated_length": 2980.0,
"completions/mean_length": 551.98828125,
"completions/mean_terminated_length": 554.1529541015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.02674867957830429,
"kl": 0.10691070556640625,
"learning_rate": 1.8333333333333333e-06,
"loss": -0.1443,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.030389247462153435,
"mask/share_reasoning": 0.8766155242919922,
"mask/share_step_conf": 0.08908900618553162,
"num_tokens": 31807195.0,
"reward": 0.6885699033737183,
"reward_std": 0.25885897874832153,
"rewards/accuracy_reward_step": 0.4140625,
"rewards/final_brier_reward_step": 0.6310105323791504,
"rewards/format_reward_step": 0.8984375,
"rewards/step_l2_reward": 0.7461292743682861,
"step": 134
},
{
"adv/mean_abs_final_conf": 0.7832664251327515,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7556527853012085,
"adv/std_final_conf": 0.9360292553901672,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9355044364929199,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6229491525423729,
"calib/avg_num_step_conf": 5.84765625,
"calib/ece": 0.2741316872427983,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.37448559670781895,
"calib/gap": 0.14562169491525412,
"calib/mean_conf": 0.6468065843621398,
"calib/mu_c": 0.7175199999999998,
"calib/mu_w": 0.5718983050847457,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.20326748971193412,
"calib/std_conf": 0.3520078674150313,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.4287592319054653,
"calib/step_q_c_n": 677.0,
"calib/step_q_gap": 0.05091776849083113,
"calib/step_q_w": 0.37784146341463415,
"calib/step_q_w_n": 820.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2560.0,
"completions/max_terminated_length": 2560.0,
"completions/mean_length": 583.97265625,
"completions/mean_terminated_length": 588.5708618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.144,
"grad_norm": 0.01977521926164627,
"kl": 0.1051788330078125,
"learning_rate": 1.8055555555555557e-06,
"loss": -0.1116,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.032413728535175323,
"mask/share_reasoning": 0.8518058061599731,
"mask/share_step_conf": 0.10796800255775452,
"num_tokens": 32062572.0,
"reward": 0.7115935683250427,
"reward_std": 0.21959969401359558,
"rewards/accuracy_reward_step": 0.48828125,
"rewards/final_brier_reward_step": 0.6358980536460876,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7872891426086426,
"step": 135
},
{
"adv/mean_abs_final_conf": 0.7137893438339233,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7463816404342651,
"adv/std_final_conf": 0.9127443432807922,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350497126579285,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8045137736475274,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.17386178861788626,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.35365853658536583,
"calib/gap": 0.4000192499170262,
"calib/mean_conf": 0.5577642276422763,
"calib/mu_c": 0.7707826086956522,
"calib/mu_w": 0.370763358778626,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.13207317073170738,
"calib/std_conf": 0.3906034019080313,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4524784853700516,
"calib/step_q_c_n": 581.0,
"calib/step_q_gap": 0.1576671646153346,
"calib/step_q_w": 0.294811320754717,
"calib/step_q_w_n": 848.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2994.0,
"completions/max_terminated_length": 2994.0,
"completions/mean_length": 508.5703125,
"completions/mean_terminated_length": 512.5748291015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.021196873858571053,
"kl": 0.1201629638671875,
"learning_rate": 1.777777777777778e-06,
"loss": -0.1318,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03404550999403,
"mask/share_reasoning": 0.8395554423332214,
"mask/share_step_conf": 0.11858654022216797,
"num_tokens": 32301254.0,
"reward": 0.7984526753425598,
"reward_std": 0.20844683051109314,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7547625303268433,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8421428203582764,
"step": 136
},
{
"adv/mean_abs_final_conf": 0.7769197821617126,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7149304151535034,
"adv/std_final_conf": 0.9298633337020874,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9358368515968323,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6837907037815125,
"calib/avg_num_step_conf": 5.7421875,
"calib/ece": 0.23072874493927123,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.31983805668016196,
"calib/gap": 0.208855698529412,
"calib/mean_conf": 0.5788259109311741,
"calib/mu_c": 0.6870588235294119,
"calib/mu_w": 0.47820312499999995,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.16388663967611333,
"calib/std_conf": 0.36934321570666856,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.42313432835820897,
"calib/step_q_c_n": 670.0,
"calib/step_q_gap": 0.05207182835820895,
"calib/step_q_w": 0.3710625,
"calib/step_q_w_n": 800.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1876.0,
"completions/max_terminated_length": 1876.0,
"completions/mean_length": 526.03515625,
"completions/mean_terminated_length": 528.0980834960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.024452555924654007,
"kl": 0.1158905029296875,
"learning_rate": 1.75e-06,
"loss": -0.1716,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03136240318417549,
"mask/share_reasoning": 0.8453494310379028,
"mask/share_step_conf": 0.11938194930553436,
"num_tokens": 32542903.0,
"reward": 0.7437588572502136,
"reward_std": 0.24398964643478394,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.6762160062789917,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8113017082214355,
"step": 137
},
{
"adv/mean_abs_final_conf": 0.7433382868766785,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7487839460372925,
"adv/std_final_conf": 0.9363875389099121,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9357391595840454,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.7615031757233592,
"calib/avg_num_step_conf": 4.921875,
"calib/ece": 0.17493723849372372,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.91015625,
"calib/frac_conf_gt_0.9": 0.40585774058577406,
"calib/gap": 0.3412371206774876,
"calib/mean_conf": 0.624142259414226,
"calib/mu_c": 0.7797692307692308,
"calib/mu_w": 0.43853211009174314,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.93359375,
"calib/pce": 0.12757322175732205,
"calib/std_conf": 0.374984513840238,
"calib/step_conf_rate": 0.93359375,
"calib/step_q_c": 0.48342105263157886,
"calib/step_q_c_n": 608.0,
"calib/step_q_gap": 0.1258750403616402,
"calib/step_q_w": 0.35754601226993865,
"calib/step_q_w_n": 652.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2417.0,
"completions/max_terminated_length": 2417.0,
"completions/mean_length": 513.0859375,
"completions/mean_terminated_length": 513.0859375,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.1472,
"grad_norm": 0.024063071236014366,
"kl": 0.1184539794921875,
"learning_rate": 1.7222222222222224e-06,
"loss": -0.1174,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.03499140590429306,
"mask/share_reasoning": 0.856407105922699,
"mask/share_step_conf": 0.10860144346952438,
"num_tokens": 32778589.0,
"reward": 0.7396991848945618,
"reward_std": 0.25229695439338684,
"rewards/accuracy_reward_step": 0.5078125,
"rewards/final_brier_reward_step": 0.7052828073501587,
"rewards/format_reward_step": 0.91015625,
"rewards/step_l2_reward": 0.7741155624389648,
"step": 138
},
{
"adv/mean_abs_final_conf": 0.774564266204834,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7389470338821411,
"adv/std_final_conf": 0.9210377931594849,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353828430175781,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7207237965175828,
"calib/avg_num_step_conf": 4.7578125,
"calib/ece": 0.18032520325203244,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.35365853658536583,
"calib/gap": 0.2964711505633322,
"calib/mean_conf": 0.5891056910569107,
"calib/mu_c": 0.7108275862068965,
"calib/mu_w": 0.4143564356435643,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.08999999999999994,
"calib/std_conf": 0.3777726940126769,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.45176470588235296,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.08779157728350462,
"calib/step_q_w": 0.36397312859884834,
"calib/step_q_w_n": 521.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2171.0,
"completions/max_terminated_length": 2171.0,
"completions/mean_length": 455.25390625,
"completions/mean_terminated_length": 455.25390625,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.02282751351594925,
"kl": 0.133056640625,
"learning_rate": 1.6944444444444446e-06,
"loss": -0.0957,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03690766170620918,
"mask/share_reasoning": 0.8504753112792969,
"mask/share_step_conf": 0.11261705309152603,
"num_tokens": 32998230.0,
"reward": 0.7724387645721436,
"reward_std": 0.24210603535175323,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.722465991973877,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8224115371704102,
"step": 139
},
{
"adv/mean_abs_final_conf": 0.743836522102356,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.766912043094635,
"adv/std_final_conf": 0.9190491437911987,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348379373550415,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7683735837805605,
"calib/avg_num_step_conf": 4.84765625,
"calib/ece": 0.1443388429752065,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.5165289256198347,
"calib/gap": 0.3545706618962433,
"calib/mean_conf": 0.7193801652892562,
"calib/mu_c": 0.8453846153846154,
"calib/mu_w": 0.4908139534883721,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.10954545454545442,
"calib/std_conf": 0.3436460257467601,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5234966887417218,
"calib/step_q_c_n": 755.0,
"calib/step_q_gap": 0.16304401384460254,
"calib/step_q_w": 0.3604526748971193,
"calib/step_q_w_n": 486.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2576.0,
"completions/max_terminated_length": 2576.0,
"completions/mean_length": 475.21484375,
"completions/mean_terminated_length": 478.9566955566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.03636842966079712,
"kl": 0.220977783203125,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.1199,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.035617418587207794,
"mask/share_reasoning": 0.8433914184570312,
"mask/share_step_conf": 0.11317867785692215,
"num_tokens": 33224901.0,
"reward": 0.7863426804542542,
"reward_std": 0.22152948379516602,
"rewards/accuracy_reward_step": 0.609375,
"rewards/final_brier_reward_step": 0.7550433874130249,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.8176419734954834,
"step": 140
},
{
"adv/mean_abs_final_conf": 0.7288170456886292,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7470444440841675,
"adv/std_final_conf": 0.9097040295600891,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351239800453186,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8300006871435442,
"calib/avg_num_step_conf": 4.96875,
"calib/ece": 0.1373306233062331,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.45121951219512196,
"calib/gap": 0.4221253349824779,
"calib/mean_conf": 0.6560840108401084,
"calib/mu_c": 0.8259637188208617,
"calib/mu_w": 0.40383838383838383,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.09792682926829271,
"calib/std_conf": 0.3738567930432718,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5041971830985916,
"calib/step_q_c_n": 710.0,
"calib/step_q_gap": 0.15293158595064155,
"calib/step_q_w": 0.3512655971479501,
"calib/step_q_w_n": 561.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 527.9921875,
"completions/mean_terminated_length": 527.9921875,
"completions/min_length": 111.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.1504,
"grad_norm": 0.027496205642819405,
"kl": 0.1147613525390625,
"learning_rate": 1.638888888888889e-06,
"loss": -0.0659,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.033179208636283875,
"mask/share_reasoning": 0.8603793382644653,
"mask/share_step_conf": 0.10644148290157318,
"num_tokens": 33467163.0,
"reward": 0.8010966777801514,
"reward_std": 0.2073913812637329,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7743968963623047,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.827796459197998,
"step": 141
},
{
"adv/mean_abs_final_conf": 0.7331016659736633,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7484983205795288,
"adv/std_final_conf": 0.912481963634491,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935049295425415,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8432870679300682,
"calib/avg_num_step_conf": 5.546875,
"calib/ece": 0.1430204081632654,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.3469387755102041,
"calib/gap": 0.45674829841185105,
"calib/mean_conf": 0.5617959183673469,
"calib/mu_c": 0.7985593220338982,
"calib/mu_w": 0.3418110236220472,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.11159183673469397,
"calib/std_conf": 0.3909350893242674,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5323050847457627,
"calib/step_q_c_n": 590.0,
"calib/step_q_gap": 0.19322074739636502,
"calib/step_q_w": 0.33908433734939764,
"calib/step_q_w_n": 830.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 540.51171875,
"completions/mean_terminated_length": 540.51171875,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.03245364874601364,
"kl": 0.1150970458984375,
"learning_rate": 1.6111111111111113e-06,
"loss": -0.0592,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03268226236104965,
"mask/share_reasoning": 0.853368878364563,
"mask/share_step_conf": 0.11394885182380676,
"num_tokens": 33710694.0,
"reward": 0.8005416393280029,
"reward_std": 0.21339192986488342,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.7789839506149292,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8220993876457214,
"step": 142
},
{
"adv/mean_abs_final_conf": 0.6933019161224365,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7632689476013184,
"adv/std_final_conf": 0.9011903405189514,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348337650299072,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.8008998875140607,
"calib/avg_num_step_conf": 5.24609375,
"calib/ece": 0.1389121338912134,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.3598326359832636,
"calib/gap": 0.4012788245219348,
"calib/mean_conf": 0.5570711297071129,
"calib/mu_c": 0.7451181102362204,
"calib/mu_w": 0.3438392857142856,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.08230125523012555,
"calib/std_conf": 0.3815468943622376,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.4791177514792899,
"calib/step_q_c_n": 676.0,
"calib/step_q_gap": 0.1619513346876857,
"calib/step_q_w": 0.31716641679160423,
"calib/step_q_w_n": 667.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2932.0,
"completions/max_terminated_length": 2932.0,
"completions/mean_length": 535.3671875,
"completions/mean_terminated_length": 535.3671875,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.025781175121665,
"kl": 0.1120758056640625,
"learning_rate": 1.5833333333333333e-06,
"loss": -0.1834,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.030763279646635056,
"mask/share_reasoning": 0.8582282662391663,
"mask/share_step_conf": 0.11100847274065018,
"num_tokens": 33955084.0,
"reward": 0.7788114547729492,
"reward_std": 0.20533671975135803,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.7449921369552612,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.8126306533813477,
"step": 143
},
{
"adv/mean_abs_final_conf": 0.7452036142349243,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7641355991363525,
"adv/std_final_conf": 0.9165805578231812,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349513649940491,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6835314091680814,
"calib/avg_num_step_conf": 5.28125,
"calib/ece": 0.19552400000000003,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.42,
"calib/gap": 0.2959222410865874,
"calib/mean_conf": 0.595356,
"calib/mu_c": 0.7078064516129032,
"calib/mu_w": 0.41188421052631585,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.08544000000000002,
"calib/std_conf": 0.38758931520876583,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4788480453972257,
"calib/step_q_c_n": 793.0,
"calib/step_q_gap": 0.10707702571922922,
"calib/step_q_w": 0.3717710196779965,
"calib/step_q_w_n": 559.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2199.0,
"completions/max_terminated_length": 2199.0,
"completions/mean_length": 497.19140625,
"completions/mean_terminated_length": 497.19140625,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.1536,
"grad_norm": 0.025835543870925903,
"kl": 0.1396636962890625,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0015,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.036746665835380554,
"mask/share_reasoning": 0.8494962453842163,
"mask/share_step_conf": 0.11375711113214493,
"num_tokens": 34186493.0,
"reward": 0.7822964191436768,
"reward_std": 0.19108529388904572,
"rewards/accuracy_reward_step": 0.60546875,
"rewards/final_brier_reward_step": 0.7314571738243103,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8331356048583984,
"step": 144
},
{
"adv/mean_abs_final_conf": 0.7166381478309631,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7478351593017578,
"adv/std_final_conf": 0.8948644399642944,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356358051300049,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7113007852509389,
"calib/avg_num_step_conf": 5.5703125,
"calib/ece": 0.21483739837398375,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.3617886178861789,
"calib/gap": 0.24325776715602587,
"calib/mean_conf": 0.6160569105691057,
"calib/mu_c": 0.7159310344827586,
"calib/mu_w": 0.47267326732673276,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.12073170731707318,
"calib/std_conf": 0.3594817588871085,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.49145888594164466,
"calib/step_q_c_n": 754.0,
"calib/step_q_gap": 0.12235174308450175,
"calib/step_q_w": 0.3691071428571429,
"calib/step_q_w_n": 672.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2765.0,
"completions/max_terminated_length": 2765.0,
"completions/mean_length": 483.09765625,
"completions/mean_terminated_length": 484.9921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.020173804834485054,
"kl": 0.1165313720703125,
"learning_rate": 1.527777777777778e-06,
"loss": -0.0922,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.03564931079745293,
"mask/share_reasoning": 0.8389637470245361,
"mask/share_step_conf": 0.12148070335388184,
"num_tokens": 34412870.0,
"reward": 0.7610074281692505,
"reward_std": 0.2144685834646225,
"rewards/accuracy_reward_step": 0.57421875,
"rewards/final_brier_reward_step": 0.7092058658599854,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8128089308738708,
"step": 145
},
{
"adv/mean_abs_final_conf": 0.7340985536575317,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.754753053188324,
"adv/std_final_conf": 0.9043131470680237,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353973269462585,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.747416561932691,
"calib/avg_num_step_conf": 5.1328125,
"calib/ece": 0.20726720647773283,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.32793522267206476,
"calib/gap": 0.3250254852674206,
"calib/mean_conf": 0.5417611336032389,
"calib/mu_c": 0.7444086021505375,
"calib/mu_w": 0.4193831168831169,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.18625506072874498,
"calib/std_conf": 0.38723643985522777,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.4945991561181435,
"calib/step_q_c_n": 474.0,
"calib/step_q_gap": 0.11234915611814345,
"calib/step_q_w": 0.38225000000000003,
"calib/step_q_w_n": 840.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2600.0,
"completions/max_terminated_length": 2600.0,
"completions/mean_length": 534.15234375,
"completions/mean_terminated_length": 534.15234375,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.0326496884226799,
"kl": 0.1174163818359375,
"learning_rate": 1.5e-06,
"loss": -0.0651,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.032749567180871964,
"mask/share_reasoning": 0.8582661151885986,
"mask/share_step_conf": 0.10898430645465851,
"num_tokens": 34656829.0,
"reward": 0.747028112411499,
"reward_std": 0.22130638360977173,
"rewards/accuracy_reward_step": 0.36328125,
"rewards/final_brier_reward_step": 0.700760543346405,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.7932957410812378,
"step": 146
},
{
"adv/mean_abs_final_conf": 0.6888132095336914,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7391669750213623,
"adv/std_final_conf": 0.8701976537704468,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350495934486389,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7470329971873498,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.24528925619834713,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.5,
"calib/gap": 0.32730054194964686,
"calib/mean_conf": 0.6731404958677685,
"calib/mu_c": 0.8476106194690267,
"calib/mu_w": 0.5203100775193799,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.22574380165289257,
"calib/std_conf": 0.37821685383569825,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5680240549828179,
"calib/step_q_c_n": 582.0,
"calib/step_q_gap": 0.17246262829853798,
"calib/step_q_w": 0.39556142668427996,
"calib/step_q_w_n": 757.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2614.0,
"completions/max_terminated_length": 2614.0,
"completions/mean_length": 513.5078125,
"completions/mean_terminated_length": 513.5078125,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.1568,
"grad_norm": 0.027548471465706825,
"kl": 0.1119842529296875,
"learning_rate": 1.4722222222222225e-06,
"loss": -0.1514,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.033459752798080444,
"mask/share_reasoning": 0.8553204536437988,
"mask/share_step_conf": 0.11121977865695953,
"num_tokens": 34891967.0,
"reward": 0.741882860660553,
"reward_std": 0.23327764868736267,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.6886242032051086,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.7951413989067078,
"step": 147
},
{
"adv/mean_abs_final_conf": 0.6712465286254883,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7319573760032654,
"adv/std_final_conf": 0.8693785667419434,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935432493686676,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.8044133771929826,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.16645161290322585,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.953125,
"calib/frac_conf_gt_0.9": 0.49193548387096775,
"calib/gap": 0.3975328947368423,
"calib/mean_conf": 0.6567741935483871,
"calib/mu_c": 0.8106578947368422,
"calib/mu_w": 0.41312499999999996,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.10516129032258069,
"calib/std_conf": 0.38179235275781737,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5186184210526316,
"calib/step_q_c_n": 760.0,
"calib/step_q_gap": 0.15973949728581543,
"calib/step_q_w": 0.3588789237668161,
"calib/step_q_w_n": 669.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3062.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 531.484375,
"completions/mean_terminated_length": 531.484375,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.02297389879822731,
"kl": 0.119415283203125,
"learning_rate": 1.4444444444444445e-06,
"loss": -0.0599,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03818276524543762,
"mask/share_reasoning": 0.8397675156593323,
"mask/share_step_conf": 0.1220497414469719,
"num_tokens": 35133139.0,
"reward": 0.7869892120361328,
"reward_std": 0.2096644788980484,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.7629504203796387,
"rewards/format_reward_step": 0.953125,
"rewards/step_l2_reward": 0.8110280632972717,
"step": 148
},
{
"adv/mean_abs_final_conf": 0.7334018349647522,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7355673313140869,
"adv/std_final_conf": 0.9075469970703125,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9344674944877625,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8240863347457628,
"calib/avg_num_step_conf": 5.66796875,
"calib/ece": 0.14654471544715447,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.3902439024390244,
"calib/gap": 0.44016419491525427,
"calib/mean_conf": 0.5920731707317073,
"calib/mu_c": 0.8211016949152542,
"calib/mu_w": 0.38093749999999993,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.1294715447154472,
"calib/std_conf": 0.3841612269667263,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5430490018148821,
"calib/step_q_c_n": 551.0,
"calib/step_q_gap": 0.18139160470698446,
"calib/step_q_w": 0.3616573971078977,
"calib/step_q_w_n": 899.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3061.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 569.98828125,
"completions/mean_terminated_length": 572.2235717773438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.02119380608201027,
"kl": 0.10820770263671875,
"learning_rate": 1.4166666666666667e-06,
"loss": -0.0682,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.032900311052799225,
"mask/share_reasoning": 0.8550213575363159,
"mask/share_step_conf": 0.10817211866378784,
"num_tokens": 35383512.0,
"reward": 0.8044848442077637,
"reward_std": 0.22013668715953827,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.7762886881828308,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8326810002326965,
"step": 149
},
{
"adv/mean_abs_final_conf": 0.6614770889282227,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7001590728759766,
"adv/std_final_conf": 0.8694519996643066,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9205872416496277,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7167489373371726,
"calib/avg_num_step_conf": 4.5546875,
"calib/ece": 0.19620408163265302,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.5346938775510204,
"calib/gap": 0.28749554367201435,
"calib/mean_conf": 0.7184897959183673,
"calib/mu_c": 0.8381818181818183,
"calib/mu_w": 0.5506862745098039,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.95703125,
"calib/pce": 0.1655102040816326,
"calib/std_conf": 0.35515182855287136,
"calib/step_conf_rate": 0.95703125,
"calib/step_q_c": 0.5661550888529887,
"calib/step_q_c_n": 619.0,
"calib/step_q_gap": 0.1576541747762063,
"calib/step_q_w": 0.4085009140767824,
"calib/step_q_w_n": 547.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2376.0,
"completions/max_terminated_length": 2376.0,
"completions/mean_length": 436.5625,
"completions/mean_terminated_length": 438.2745361328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.16,
"grad_norm": 0.032520148903131485,
"kl": 0.1268768310546875,
"learning_rate": 1.3888888888888892e-06,
"loss": -0.0493,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.040418535470962524,
"mask/share_reasoning": 0.8314287066459656,
"mask/share_step_conf": 0.12424648553133011,
"num_tokens": 35600232.0,
"reward": 0.7510082125663757,
"reward_std": 0.2066630870103836,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7011894583702087,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8008269667625427,
"step": 150
},
{
"adv/mean_abs_final_conf": 0.7541616559028625,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7513711452484131,
"adv/std_final_conf": 0.9041014909744263,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351353645324707,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.7779142877791428,
"calib/avg_num_step_conf": 4.84765625,
"calib/ece": 0.17130252100840337,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.3067226890756303,
"calib/gap": 0.362661704126617,
"calib/mean_conf": 0.55609243697479,
"calib/mu_c": 0.7648514851485149,
"calib/mu_w": 0.40218978102189784,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.94921875,
"calib/nonempty_step_conf_rate": 0.9375,
"calib/pce": 0.1515126050420168,
"calib/std_conf": 0.3741661090607905,
"calib/step_conf_rate": 0.9375,
"calib/step_q_c": 0.5342436974789916,
"calib/step_q_c_n": 476.0,
"calib/step_q_gap": 0.13857049486461248,
"calib/step_q_w": 0.3956732026143791,
"calib/step_q_w_n": 765.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2822.0,
"completions/max_terminated_length": 2822.0,
"completions/mean_length": 535.37109375,
"completions/mean_terminated_length": 535.37109375,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.02657642960548401,
"kl": 0.106231689453125,
"learning_rate": 1.3611111111111112e-06,
"loss": -0.1509,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.031711041927337646,
"mask/share_reasoning": 0.8681683540344238,
"mask/share_step_conf": 0.1001206785440445,
"num_tokens": 35844311.0,
"reward": 0.7364934682846069,
"reward_std": 0.2154543697834015,
"rewards/accuracy_reward_step": 0.39453125,
"rewards/final_brier_reward_step": 0.7103277444839478,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": 0.7626591920852661,
"step": 151
},
{
"adv/mean_abs_final_conf": 0.799831748008728,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7398291230201721,
"adv/std_final_conf": 0.9359056353569031,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353933930397034,
"calib/answer_extract_rate": 0.9140625,
"calib/auroc": 0.7272593247825447,
"calib/avg_num_step_conf": 5.33203125,
"calib/ece": 0.20055276595744687,
"calib/final_conf_rate": 0.91796875,
"calib/format_rate": 0.89453125,
"calib/frac_conf_gt_0.9": 0.3148936170212766,
"calib/gap": 0.3046464175143743,
"calib/mean_conf": 0.567191914893617,
"calib/mu_c": 0.7396088235294119,
"calib/mu_w": 0.4349624060150376,
"calib/nonempty_final_conf_rate": 0.91796875,
"calib/nonempty_reasoning_rate": 0.953125,
"calib/nonempty_step_conf_rate": 0.94140625,
"calib/pce": 0.16685106382978726,
"calib/std_conf": 0.36984893382234413,
"calib/step_conf_rate": 0.94140625,
"calib/step_q_c": 0.4948689138576779,
"calib/step_q_c_n": 534.0,
"calib/step_q_gap": 0.14458010519341802,
"calib/step_q_w": 0.3502888086642599,
"calib/step_q_w_n": 831.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2550.0,
"completions/max_terminated_length": 2550.0,
"completions/mean_length": 506.484375,
"completions/mean_terminated_length": 510.4724426269531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.033719953149557114,
"kl": 0.1153106689453125,
"learning_rate": 1.3333333333333334e-06,
"loss": -0.1198,
"mask/has_final_conf_rate": 0.91796875,
"mask/share_final_conf": 0.03530751168727875,
"mask/share_reasoning": 0.8421989679336548,
"mask/share_step_conf": 0.11468097567558289,
"num_tokens": 36079363.0,
"reward": 0.7214515209197998,
"reward_std": 0.24063536524772644,
"rewards/accuracy_reward_step": 0.3984375,
"rewards/final_brier_reward_step": 0.6684511303901672,
"rewards/format_reward_step": 0.89453125,
"rewards/step_l2_reward": 0.7744519114494324,
"step": 152
},
{
"adv/mean_abs_final_conf": 0.7516984939575195,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7617511749267578,
"adv/std_final_conf": 0.9103282690048218,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350059032440186,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7148971466489715,
"calib/avg_num_step_conf": 4.71875,
"calib/ece": 0.18222672064777334,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.2631578947368421,
"calib/gap": 0.28025746516257455,
"calib/mean_conf": 0.535991902834008,
"calib/mu_c": 0.6608029197080292,
"calib/mu_w": 0.38054545454545463,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.08178137651821868,
"calib/std_conf": 0.37345054783107035,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.49373608903020666,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.11404696985922219,
"calib/step_q_w": 0.37968911917098447,
"calib/step_q_w_n": 579.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3003.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 495.65234375,
"completions/mean_terminated_length": 497.5960998535156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.1632,
"grad_norm": 0.03105360083281994,
"kl": 0.11749267578125,
"learning_rate": 1.3055555555555556e-06,
"loss": -0.0638,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03299054503440857,
"mask/share_reasoning": 0.8585386872291565,
"mask/share_step_conf": 0.10456451773643494,
"num_tokens": 36313570.0,
"reward": 0.7574142217636108,
"reward_std": 0.210285484790802,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7134543061256409,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8013741970062256,
"step": 153
},
{
"adv/mean_abs_final_conf": 0.7263320684432983,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7746478319168091,
"adv/std_final_conf": 0.8918282985687256,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348356127738953,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6936760164693774,
"calib/avg_num_step_conf": 4.64453125,
"calib/ece": 0.2270988,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.336,
"calib/gap": 0.2798690941842511,
"calib/mean_conf": 0.5604212,
"calib/mu_c": 0.7104310344827586,
"calib/mu_w": 0.4305619402985075,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.16175999999999996,
"calib/std_conf": 0.3910226783072307,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5556363636363637,
"calib/step_q_c_n": 495.0,
"calib/step_q_gap": 0.181702646057113,
"calib/step_q_w": 0.3739337175792507,
"calib/step_q_w_n": 694.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1656.0,
"completions/max_terminated_length": 1656.0,
"completions/mean_length": 455.3203125,
"completions/mean_terminated_length": 458.905517578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.030806738883256912,
"kl": 0.1298980712890625,
"learning_rate": 1.2777777777777779e-06,
"loss": -0.0773,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.035829126834869385,
"mask/share_reasoning": 0.8452428579330444,
"mask/share_step_conf": 0.11111550778150558,
"num_tokens": 36534572.0,
"reward": 0.7740947008132935,
"reward_std": 0.17938129603862762,
"rewards/accuracy_reward_step": 0.453125,
"rewards/final_brier_reward_step": 0.7112395763397217,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8369499444961548,
"step": 154
},
{
"adv/mean_abs_final_conf": 0.7826955318450928,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7361176013946533,
"adv/std_final_conf": 0.935632050037384,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9358672499656677,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.6956580640791166,
"calib/avg_num_step_conf": 5.359375,
"calib/ece": 0.22495901639344262,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.26639344262295084,
"calib/gap": 0.26259229153965985,
"calib/mean_conf": 0.5129918032786884,
"calib/mu_c": 0.656126126126126,
"calib/mu_w": 0.3935338345864661,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.14151639344262296,
"calib/std_conf": 0.37540860789762587,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.48486868686868684,
"calib/step_q_c_n": 495.0,
"calib/step_q_gap": 0.15307849302604143,
"calib/step_q_w": 0.3317901938426454,
"calib/step_q_w_n": 877.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 453.94140625,
"completions/mean_terminated_length": 457.5157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.02824331633746624,
"kl": 0.1313629150390625,
"learning_rate": 1.25e-06,
"loss": -0.0847,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03731415420770645,
"mask/share_reasoning": 0.8393542766571045,
"mask/share_step_conf": 0.11551909893751144,
"num_tokens": 36757997.0,
"reward": 0.7349332571029663,
"reward_std": 0.2547478675842285,
"rewards/accuracy_reward_step": 0.4453125,
"rewards/final_brier_reward_step": 0.6926234364509583,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.7772430181503296,
"step": 155
},
{
"adv/mean_abs_final_conf": 0.7364197373390198,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7438960075378418,
"adv/std_final_conf": 0.9156517386436462,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347387552261353,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7281936030226045,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.1750806451612903,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.29838709677419356,
"calib/gap": 0.33133737215816556,
"calib/mean_conf": 0.5179032258064517,
"calib/mu_c": 0.6902521008403361,
"calib/mu_w": 0.3589147286821705,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.10657258064516124,
"calib/std_conf": 0.38794282542153785,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.4847897897897898,
"calib/step_q_c_n": 666.0,
"calib/step_q_gap": 0.129165800571461,
"calib/step_q_w": 0.35562398921832883,
"calib/step_q_w_n": 742.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2671.0,
"completions/max_terminated_length": 2671.0,
"completions/mean_length": 520.078125,
"completions/mean_terminated_length": 520.078125,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.1664,
"grad_norm": 0.02229756861925125,
"kl": 0.1176910400390625,
"learning_rate": 1.2222222222222223e-06,
"loss": -0.0485,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03484189510345459,
"mask/share_reasoning": 0.8441666960716248,
"mask/share_step_conf": 0.12099139392375946,
"num_tokens": 36995897.0,
"reward": 0.7709934711456299,
"reward_std": 0.21262159943580627,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.7337499856948853,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8082370162010193,
"step": 156
},
{
"adv/mean_abs_final_conf": 0.6937302350997925,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7066144943237305,
"adv/std_final_conf": 0.8964200019836426,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352517127990723,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.794429525580114,
"calib/avg_num_step_conf": 5.94140625,
"calib/ece": 0.13188524590163936,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.38934426229508196,
"calib/gap": 0.4263003511671142,
"calib/mean_conf": 0.5777049180327868,
"calib/mu_c": 0.7576595744680851,
"calib/mu_w": 0.3313592233009709,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.06586065573770493,
"calib/std_conf": 0.3969138876861177,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5136559036144578,
"calib/step_q_c_n": 830.0,
"calib/step_q_gap": 0.16106545498927688,
"calib/step_q_w": 0.3525904486251809,
"calib/step_q_w_n": 691.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2595.0,
"completions/max_terminated_length": 2595.0,
"completions/mean_length": 513.89453125,
"completions/mean_terminated_length": 517.94091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.029141323640942574,
"kl": 0.1168365478515625,
"learning_rate": 1.1944444444444446e-06,
"loss": -0.0538,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.034257110208272934,
"mask/share_reasoning": 0.8325086832046509,
"mask/share_step_conf": 0.1254216879606247,
"num_tokens": 37231182.0,
"reward": 0.7845574617385864,
"reward_std": 0.19589388370513916,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7629241943359375,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8061907291412354,
"step": 157
},
{
"adv/mean_abs_final_conf": 0.7426419258117676,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7533231973648071,
"adv/std_final_conf": 0.9092576503753662,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353457093238831,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.7546258503401361,
"calib/avg_num_step_conf": 4.875,
"calib/ece": 0.1671428571428572,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.3673469387755102,
"calib/gap": 0.3399285714285714,
"calib/mean_conf": 0.6068163265306123,
"calib/mu_c": 0.7525,
"calib/mu_w": 0.41257142857142853,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.10126530612244904,
"calib/std_conf": 0.3710267977267997,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5364376996805111,
"calib/step_q_c_n": 626.0,
"calib/step_q_gap": 0.14506953247793875,
"calib/step_q_w": 0.3913681672025724,
"calib/step_q_w_n": 622.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2040.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 458.75390625,
"completions/mean_terminated_length": 462.36614990234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.028471689671278,
"kl": 0.1329345703125,
"learning_rate": 1.1666666666666668e-06,
"loss": -0.0743,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03755185008049011,
"mask/share_reasoning": 0.8372524380683899,
"mask/share_step_conf": 0.11738321185112,
"num_tokens": 37453863.0,
"reward": 0.7625678777694702,
"reward_std": 0.23472969233989716,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7292590141296387,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7958768606185913,
"step": 158
},
{
"adv/mean_abs_final_conf": 0.7335477471351624,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7329621911048889,
"adv/std_final_conf": 0.9072332978248596,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348861575126648,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.6955052493438321,
"calib/avg_num_step_conf": 5.1875,
"calib/ece": 0.22028340080971667,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.3522267206477733,
"calib/gap": 0.24983136482939633,
"calib/mean_conf": 0.5543724696356276,
"calib/mu_c": 0.675748031496063,
"calib/mu_w": 0.42591666666666667,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.13024291497975715,
"calib/std_conf": 0.38340303279691007,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5337541528239202,
"calib/step_q_c_n": 602.0,
"calib/step_q_gap": 0.14590291315449871,
"calib/step_q_w": 0.38785123966942153,
"calib/step_q_w_n": 726.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1461.0,
"completions/max_terminated_length": 1461.0,
"completions/mean_length": 443.578125,
"completions/mean_terminated_length": 445.31768798828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.1696,
"grad_norm": 0.027787763625383377,
"kl": 0.1327056884765625,
"learning_rate": 1.138888888888889e-06,
"loss": -0.1012,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03695898503065109,
"mask/share_reasoning": 0.8409558534622192,
"mask/share_step_conf": 0.11817888915538788,
"num_tokens": 37672203.0,
"reward": 0.765270471572876,
"reward_std": 0.1936519891023636,
"rewards/accuracy_reward_step": 0.50390625,
"rewards/final_brier_reward_step": 0.6986597776412964,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8318811058998108,
"step": 159
},
{
"adv/mean_abs_final_conf": 0.7273938655853271,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7440230250358582,
"adv/std_final_conf": 0.9046057462692261,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353306293487549,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7467687413004575,
"calib/avg_num_step_conf": 4.734375,
"calib/ece": 0.18802419354838706,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.3467741935483871,
"calib/gap": 0.32727580035792403,
"calib/mean_conf": 0.5456048387096775,
"calib/mu_c": 0.6868085106382978,
"calib/mu_w": 0.3595327102803738,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.0825403225806451,
"calib/std_conf": 0.39425874340435424,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5295440729483283,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.13961627511439328,
"calib/step_q_w": 0.389927797833935,
"calib/step_q_w_n": 554.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2503.0,
"completions/max_terminated_length": 2503.0,
"completions/mean_length": 490.8359375,
"completions/mean_terminated_length": 490.8359375,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.026033377274870872,
"kl": 0.1306304931640625,
"learning_rate": 1.111111111111111e-06,
"loss": -0.071,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.0354008786380291,
"mask/share_reasoning": 0.8589190244674683,
"mask/share_step_conf": 0.10568006336688995,
"num_tokens": 37902697.0,
"reward": 0.7788609266281128,
"reward_std": 0.20089104771614075,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7316604852676392,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8260613083839417,
"step": 160
},
{
"adv/mean_abs_final_conf": 0.6831102967262268,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7423896789550781,
"adv/std_final_conf": 0.890784740447998,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347586035728455,
"calib/answer_extract_rate": 0.9921875,
"calib/auroc": 0.6956490929705216,
"calib/avg_num_step_conf": 5.01953125,
"calib/ece": 0.2265476190476191,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.9765625,
"calib/frac_conf_gt_0.9": 0.38095238095238093,
"calib/gap": 0.27446428571428566,
"calib/mean_conf": 0.5822619047619049,
"calib/mu_c": 0.67375,
"calib/mu_w": 0.3992857142857143,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.07107142857142865,
"calib/std_conf": 0.3908162536337356,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.522169373549884,
"calib/step_q_c_n": 862.0,
"calib/step_q_gap": 0.12871783690685806,
"calib/step_q_w": 0.39345153664302596,
"calib/step_q_w_n": 423.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1364.0,
"completions/max_terminated_length": 1364.0,
"completions/mean_length": 448.42578125,
"completions/mean_terminated_length": 450.184326171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.03522691875696182,
"kl": 0.1334075927734375,
"learning_rate": 1.0833333333333335e-06,
"loss": -0.0441,
"mask/has_final_conf_rate": 0.98046875,
"mask/share_final_conf": 0.037295594811439514,
"mask/share_reasoning": 0.838032603263855,
"mask/share_step_conf": 0.12076550722122192,
"num_tokens": 38121414.0,
"reward": 0.7708685994148254,
"reward_std": 0.18391358852386475,
"rewards/accuracy_reward_step": 0.66015625,
"rewards/final_brier_reward_step": 0.720809817314148,
"rewards/format_reward_step": 0.9765625,
"rewards/step_l2_reward": 0.8209274411201477,
"step": 161
},
{
"adv/mean_abs_final_conf": 0.6881465315818787,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7457125782966614,
"adv/std_final_conf": 0.8926540017127991,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345178008079529,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7456922162804516,
"calib/avg_num_step_conf": 4.7578125,
"calib/ece": 0.1726190476190475,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.48412698412698413,
"calib/gap": 0.3740463458110516,
"calib/mean_conf": 0.6173015873015874,
"calib/mu_c": 0.7642483660130719,
"calib/mu_w": 0.39020202020202027,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.09138888888888877,
"calib/std_conf": 0.39745502430675955,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5439555555555555,
"calib/step_q_c_n": 675.0,
"calib/step_q_gap": 0.13649699201964394,
"calib/step_q_w": 0.4074585635359116,
"calib/step_q_w_n": 543.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1497.0,
"completions/max_terminated_length": 1497.0,
"completions/mean_length": 433.66796875,
"completions/mean_terminated_length": 435.36865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.1728,
"grad_norm": 0.036304570734500885,
"kl": 0.137939453125,
"learning_rate": 1.0555555555555557e-06,
"loss": -0.0579,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03735470026731491,
"mask/share_reasoning": 0.8399271965026855,
"mask/share_step_conf": 0.11881189048290253,
"num_tokens": 38336577.0,
"reward": 0.7964285016059875,
"reward_std": 0.19373297691345215,
"rewards/accuracy_reward_step": 0.6015625,
"rewards/final_brier_reward_step": 0.7544523477554321,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.838404655456543,
"step": 162
},
{
"adv/mean_abs_final_conf": 0.7097820043563843,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7364397644996643,
"adv/std_final_conf": 0.8892258405685425,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9342389106750488,
"calib/answer_extract_rate": 0.9609375,
"calib/auroc": 0.8013877715205149,
"calib/avg_num_step_conf": 5.5,
"calib/ece": 0.16159183673469385,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.3795918367346939,
"calib/gap": 0.3858004827031376,
"calib/mean_conf": 0.563877551020408,
"calib/mu_c": 0.7418181818181818,
"calib/mu_w": 0.3560176991150442,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.09334693877551016,
"calib/std_conf": 0.3870960241917022,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5473961661341853,
"calib/step_q_c_n": 626.0,
"calib/step_q_gap": 0.2016416904308605,
"calib/step_q_w": 0.3457544757033248,
"calib/step_q_w_n": 782.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 540.84765625,
"completions/mean_terminated_length": 540.84765625,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.02911502867937088,
"kl": 0.1142425537109375,
"learning_rate": 1.0277777777777777e-06,
"loss": -0.1082,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.034962691366672516,
"mask/share_reasoning": 0.8461117148399353,
"mask/share_step_conf": 0.11892561614513397,
"num_tokens": 38579866.0,
"reward": 0.790389895439148,
"reward_std": 0.1950080692768097,
"rewards/accuracy_reward_step": 0.51953125,
"rewards/final_brier_reward_step": 0.7531589269638062,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8276206851005554,
"step": 163
},
{
"adv/mean_abs_final_conf": 0.7328827381134033,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7585192918777466,
"adv/std_final_conf": 0.9071486592292786,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9354165196418762,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.7537022321751065,
"calib/avg_num_step_conf": 5.55078125,
"calib/ece": 0.17059322033898316,
"calib/final_conf_rate": 0.921875,
"calib/format_rate": 0.91796875,
"calib/frac_conf_gt_0.9": 0.3644067796610169,
"calib/gap": 0.3148306003034024,
"calib/mean_conf": 0.5703389830508475,
"calib/mu_c": 0.715748031496063,
"calib/mu_w": 0.40091743119266066,
"calib/nonempty_final_conf_rate": 0.921875,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.10139830508474584,
"calib/std_conf": 0.3783952113041983,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5105730659025788,
"calib/step_q_c_n": 698.0,
"calib/step_q_gap": 0.13347763021793152,
"calib/step_q_w": 0.37709543568464726,
"calib/step_q_w_n": 723.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2937.0,
"completions/max_terminated_length": 2937.0,
"completions/mean_length": 588.8046875,
"completions/mean_terminated_length": 593.44091796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.029559997841715813,
"kl": 0.10688018798828125,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0654,
"mask/has_final_conf_rate": 0.921875,
"mask/share_final_conf": 0.03045329451560974,
"mask/share_reasoning": 0.8584093451499939,
"mask/share_step_conf": 0.10332489013671875,
"num_tokens": 38836736.0,
"reward": 0.7449078559875488,
"reward_std": 0.22561901807785034,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7001925706863403,
"rewards/format_reward_step": 0.91796875,
"rewards/step_l2_reward": 0.7896230816841125,
"step": 164
},
{
"adv/mean_abs_final_conf": 0.7392752170562744,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7389962673187256,
"adv/std_final_conf": 0.9226400852203369,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351508021354675,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7567901234567901,
"calib/avg_num_step_conf": 5.3203125,
"calib/ece": 0.24765432098765444,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.39094650205761317,
"calib/gap": 0.3417407407407406,
"calib/mean_conf": 0.608477366255144,
"calib/mu_c": 0.7983333333333332,
"calib/mu_w": 0.4565925925925926,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.20584362139917706,
"calib/std_conf": 0.3743918889676306,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5377734375,
"calib/step_q_c_n": 512.0,
"calib/step_q_gap": 0.1569616727941176,
"calib/step_q_w": 0.38081176470588235,
"calib/step_q_w_n": 850.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2987.0,
"completions/max_terminated_length": 2987.0,
"completions/mean_length": 546.93359375,
"completions/mean_terminated_length": 549.0784912109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.176,
"grad_norm": 0.034481655806303024,
"kl": 0.11751556396484375,
"learning_rate": 9.722222222222224e-07,
"loss": -0.0787,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03241300582885742,
"mask/share_reasoning": 0.8556787371635437,
"mask/share_step_conf": 0.10800202190876007,
"num_tokens": 39082327.0,
"reward": 0.759535014629364,
"reward_std": 0.20878538489341736,
"rewards/accuracy_reward_step": 0.421875,
"rewards/final_brier_reward_step": 0.712382435798645,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8066876530647278,
"step": 165
},
{
"adv/mean_abs_final_conf": 0.6982669830322266,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7666926980018616,
"adv/std_final_conf": 0.8951435685157776,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345536231994629,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7946651949963208,
"calib/avg_num_step_conf": 5.3515625,
"calib/ece": 0.15782572614107884,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.36929460580912865,
"calib/gap": 0.4003985283296541,
"calib/mean_conf": 0.5773609958506224,
"calib/mu_c": 0.726887417218543,
"calib/mu_w": 0.3264888888888889,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.05431535269709547,
"calib/std_conf": 0.38817360188537814,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5173142112125163,
"calib/step_q_c_n": 767.0,
"calib/step_q_gap": 0.13766246991898395,
"calib/step_q_w": 0.37965174129353235,
"calib/step_q_w_n": 603.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2492.0,
"completions/max_terminated_length": 2492.0,
"completions/mean_length": 536.796875,
"completions/mean_terminated_length": 536.796875,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.028494818136096,
"kl": 0.1080780029296875,
"learning_rate": 9.444444444444445e-07,
"loss": -0.1071,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.032391056418418884,
"mask/share_reasoning": 0.8516812324523926,
"mask/share_step_conf": 0.11592767387628555,
"num_tokens": 39325931.0,
"reward": 0.7896512746810913,
"reward_std": 0.18727052211761475,
"rewards/accuracy_reward_step": 0.59765625,
"rewards/final_brier_reward_step": 0.7533988952636719,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8259036540985107,
"step": 166
},
{
"adv/mean_abs_final_conf": 0.6581146717071533,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.769611656665802,
"adv/std_final_conf": 0.872353196144104,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350212216377258,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.6861284745311877,
"calib/avg_num_step_conf": 5.06640625,
"calib/ece": 0.23337349397590368,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.4939759036144578,
"calib/gap": 0.21856097885357095,
"calib/mean_conf": 0.6993172690763052,
"calib/mu_c": 0.7897260273972603,
"calib/mu_w": 0.5711650485436893,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.17317269076305225,
"calib/std_conf": 0.35386213329557303,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5674607703281026,
"calib/step_q_c_n": 701.0,
"calib/step_q_gap": 0.09635338777776709,
"calib/step_q_w": 0.4711073825503355,
"calib/step_q_w_n": 596.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2426.0,
"completions/max_terminated_length": 2426.0,
"completions/mean_length": 513.32421875,
"completions/mean_terminated_length": 515.3372802734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.024843445047736168,
"kl": 0.114715576171875,
"learning_rate": 9.166666666666666e-07,
"loss": -0.0683,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.033744700253009796,
"mask/share_reasoning": 0.85487961769104,
"mask/share_step_conf": 0.10746943950653076,
"num_tokens": 39562950.0,
"reward": 0.7541587352752686,
"reward_std": 0.18127688765525818,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.7017910480499268,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8065265417098999,
"step": 167
},
{
"adv/mean_abs_final_conf": 0.6994673609733582,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7407394051551819,
"adv/std_final_conf": 0.8693518042564392,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341798424720764,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7639729325265144,
"calib/avg_num_step_conf": 5.609375,
"calib/ece": 0.15360000000000001,
"calib/final_conf_rate": 0.9765625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.36,
"calib/gap": 0.3358071442514153,
"calib/mean_conf": 0.60912,
"calib/mu_c": 0.755531914893617,
"calib/mu_w": 0.41972477064220176,
"calib/nonempty_final_conf_rate": 0.9765625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.09936,
"calib/std_conf": 0.36863969618043035,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5279178470254958,
"calib/step_q_c_n": 706.0,
"calib/step_q_gap": 0.19868497031316706,
"calib/step_q_w": 0.32923287671232876,
"calib/step_q_w_n": 730.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2920.0,
"completions/max_terminated_length": 2920.0,
"completions/mean_length": 555.65625,
"completions/mean_terminated_length": 555.65625,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.1792,
"grad_norm": 0.030230311676859856,
"kl": 0.1105194091796875,
"learning_rate": 8.88888888888889e-07,
"loss": -0.0252,
"mask/has_final_conf_rate": 0.9765625,
"mask/share_final_conf": 0.03307843208312988,
"mask/share_reasoning": 0.8582138419151306,
"mask/share_step_conf": 0.10870774835348129,
"num_tokens": 39809870.0,
"reward": 0.8047250509262085,
"reward_std": 0.187750905752182,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7603687047958374,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.8490815162658691,
"step": 168
},
{
"adv/mean_abs_final_conf": 0.6772477030754089,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7406015396118164,
"adv/std_final_conf": 0.8583061099052429,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9343918561935425,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.7615018688413184,
"calib/avg_num_step_conf": 4.453125,
"calib/ece": 0.15352459016393438,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.42213114754098363,
"calib/gap": 0.353482840638804,
"calib/mean_conf": 0.6317213114754098,
"calib/mu_c": 0.7896296296296297,
"calib/mu_w": 0.4361467889908257,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.96484375,
"calib/nonempty_step_conf_rate": 0.94921875,
"calib/pce": 0.11598360655737702,
"calib/std_conf": 0.36956264054085053,
"calib/step_conf_rate": 0.94921875,
"calib/step_q_c": 0.5921711568938193,
"calib/step_q_c_n": 631.0,
"calib/step_q_gap": 0.1683794083672967,
"calib/step_q_w": 0.42379174852652257,
"calib/step_q_w_n": 509.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2822.0,
"completions/max_terminated_length": 2822.0,
"completions/mean_length": 522.76953125,
"completions/mean_terminated_length": 522.76953125,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.01928914710879326,
"kl": 0.118133544921875,
"learning_rate": 8.611111111111112e-07,
"loss": -0.1741,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.03361359238624573,
"mask/share_reasoning": 0.8643434047698975,
"mask/share_step_conf": 0.1020430251955986,
"num_tokens": 40047883.0,
"reward": 0.7742510437965393,
"reward_std": 0.19515499472618103,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7330683469772339,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8154337406158447,
"step": 169
},
{
"adv/mean_abs_final_conf": 0.685404896736145,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7570817470550537,
"adv/std_final_conf": 0.8591107130050659,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9352391958236694,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.8053858577347096,
"calib/avg_num_step_conf": 5.49609375,
"calib/ece": 0.15433884297520664,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.47107438016528924,
"calib/gap": 0.37613720946562756,
"calib/mean_conf": 0.6625206611570248,
"calib/mu_c": 0.8195035460992909,
"calib/mu_w": 0.4433663366336633,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.11710743801652895,
"calib/std_conf": 0.37280909521354627,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5794684796044499,
"calib/step_q_c_n": 809.0,
"calib/step_q_gap": 0.15316413177836297,
"calib/step_q_w": 0.42630434782608695,
"calib/step_q_w_n": 598.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2616.0,
"completions/max_terminated_length": 2616.0,
"completions/mean_length": 535.41796875,
"completions/mean_terminated_length": 541.766845703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.02909146249294281,
"kl": 0.1026763916015625,
"learning_rate": 8.333333333333333e-07,
"loss": -0.1038,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03135133534669876,
"mask/share_reasoning": 0.8453192114830017,
"mask/share_step_conf": 0.11161071807146072,
"num_tokens": 40289102.0,
"reward": 0.7756690979003906,
"reward_std": 0.20986171066761017,
"rewards/accuracy_reward_step": 0.55078125,
"rewards/final_brier_reward_step": 0.7445504069328308,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8067878484725952,
"step": 170
},
{
"adv/mean_abs_final_conf": 0.7240392565727234,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.750836193561554,
"adv/std_final_conf": 0.8942993879318237,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346383810043335,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7470126208378088,
"calib/avg_num_step_conf": 4.8671875,
"calib/ece": 0.23118367346938784,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.46530612244897956,
"calib/gap": 0.30954417293233083,
"calib/mean_conf": 0.6495510204081633,
"calib/mu_c": 0.8175892857142858,
"calib/mu_w": 0.508045112781955,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.211795918367347,
"calib/std_conf": 0.3764563590299157,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5977127659574468,
"calib/step_q_c_n": 564.0,
"calib/step_q_gap": 0.20283006801023273,
"calib/step_q_w": 0.3948826979472141,
"calib/step_q_w_n": 682.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2778.0,
"completions/max_terminated_length": 2778.0,
"completions/mean_length": 503.66796875,
"completions/mean_terminated_length": 505.6431579589844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.1824,
"grad_norm": 0.028601879253983498,
"kl": 0.11051177978515625,
"learning_rate": 8.055555555555557e-07,
"loss": -0.0867,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.03329313173890114,
"mask/share_reasoning": 0.8577781915664673,
"mask/share_step_conf": 0.1050223857164383,
"num_tokens": 40524937.0,
"reward": 0.743405818939209,
"reward_std": 0.2160402238368988,
"rewards/accuracy_reward_step": 0.44140625,
"rewards/final_brier_reward_step": 0.6792566180229187,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.8075549006462097,
"step": 171
},
{
"adv/mean_abs_final_conf": 0.7263467311859131,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7435145378112793,
"adv/std_final_conf": 0.8930355906486511,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.934866726398468,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.7686213991769547,
"calib/avg_num_step_conf": 5.0390625,
"calib/ece": 0.18440329218106993,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9375,
"calib/frac_conf_gt_0.9": 0.5185185185185185,
"calib/gap": 0.31218518518518523,
"calib/mean_conf": 0.7091769547325102,
"calib/mu_c": 0.847925925925926,
"calib/mu_w": 0.5357407407407407,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.16901234567901235,
"calib/std_conf": 0.3398234556835382,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5687214611872146,
"calib/step_q_c_n": 657.0,
"calib/step_q_gap": 0.13083836482070593,
"calib/step_q_w": 0.43788309636650863,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3033.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 532.25,
"completions/mean_terminated_length": 532.25,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.025736555457115173,
"kl": 0.108001708984375,
"learning_rate": 7.777777777777779e-07,
"loss": -0.0984,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.03396880626678467,
"mask/share_reasoning": 0.8518245220184326,
"mask/share_step_conf": 0.11420667916536331,
"num_tokens": 40764545.0,
"reward": 0.7568353414535522,
"reward_std": 0.2210984230041504,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7214609384536743,
"rewards/format_reward_step": 0.9375,
"rewards/step_l2_reward": 0.7922097444534302,
"step": 172
},
{
"adv/mean_abs_final_conf": 0.7614967823028564,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7911373972892761,
"adv/std_final_conf": 0.908078670501709,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351539611816406,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.660099816128185,
"calib/avg_num_step_conf": 5.58203125,
"calib/ece": 0.2597188755020081,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.96875,
"calib/frac_conf_gt_0.9": 0.5421686746987951,
"calib/gap": 0.16133175728920413,
"calib/mean_conf": 0.7683935742971887,
"calib/mu_c": 0.8383687943262411,
"calib/mu_w": 0.677037037037037,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.23092369477911653,
"calib/std_conf": 0.305166735843764,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.5414319248826291,
"calib/step_q_c_n": 852.0,
"calib/step_q_gap": 0.024690157118331113,
"calib/step_q_w": 0.516741767764298,
"calib/step_q_w_n": 577.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2027.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 509.953125,
"completions/mean_terminated_length": 509.953125,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.029172774404287338,
"kl": 0.11724853515625,
"learning_rate": 7.5e-07,
"loss": -0.0033,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03518332913517952,
"mask/share_reasoning": 0.8441824316978455,
"mask/share_step_conf": 0.12063427269458771,
"num_tokens": 40998253.0,
"reward": 0.7339141368865967,
"reward_std": 0.21232792735099792,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.676971435546875,
"rewards/format_reward_step": 0.96875,
"rewards/step_l2_reward": 0.7908568382263184,
"step": 173
},
{
"adv/mean_abs_final_conf": 0.799006462097168,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7779349684715271,
"adv/std_final_conf": 0.9240788221359253,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9356370568275452,
"calib/answer_extract_rate": 0.9453125,
"calib/auroc": 0.6908530606527525,
"calib/avg_num_step_conf": 5.0546875,
"calib/ece": 0.22924686192468613,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.3891213389121339,
"calib/gap": 0.22902577391791556,
"calib/mean_conf": 0.6314225941422595,
"calib/mu_c": 0.7473728813559322,
"calib/mu_w": 0.5183471074380166,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.96875,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.18347280334728028,
"calib/std_conf": 0.36237776076612777,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5223063063063063,
"calib/step_q_c_n": 555.0,
"calib/step_q_gap": 0.08102078533201668,
"calib/step_q_w": 0.4412855209742896,
"calib/step_q_w_n": 739.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2731.0,
"completions/max_terminated_length": 2731.0,
"completions/mean_length": 548.2109375,
"completions/mean_terminated_length": 550.36083984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.1856,
"grad_norm": 0.026223061606287956,
"kl": 0.1020355224609375,
"learning_rate": 7.222222222222222e-07,
"loss": -0.2093,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.030761104077100754,
"mask/share_reasoning": 0.8606378436088562,
"mask/share_step_conf": 0.10469479858875275,
"num_tokens": 41242827.0,
"reward": 0.7080938816070557,
"reward_std": 0.25465157628059387,
"rewards/accuracy_reward_step": 0.46484375,
"rewards/final_brier_reward_step": 0.6587077975273132,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7574799060821533,
"step": 174
},
{
"adv/mean_abs_final_conf": 0.6954188942909241,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.734821617603302,
"adv/std_final_conf": 0.877416729927063,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350839257240295,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.7567567567567569,
"calib/avg_num_step_conf": 5.4375,
"calib/ece": 0.2215833333333332,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.93359375,
"calib/frac_conf_gt_0.9": 0.375,
"calib/gap": 0.30361198407710027,
"calib/mean_conf": 0.5615833333333333,
"calib/mu_c": 0.7247747747747747,
"calib/mu_w": 0.4211627906976744,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.95703125,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.16033333333333322,
"calib/std_conf": 0.38855286176901194,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5498676748582231,
"calib/step_q_c_n": 529.0,
"calib/step_q_gap": 0.16305423337502495,
"calib/step_q_w": 0.38681344148319813,
"calib/step_q_w_n": 863.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2585.0,
"completions/max_terminated_length": 2585.0,
"completions/mean_length": 532.87109375,
"completions/mean_terminated_length": 539.1897583007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.02312024123966694,
"kl": 0.10526275634765625,
"learning_rate": 6.944444444444446e-07,
"loss": -0.182,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.03259507194161415,
"mask/share_reasoning": 0.8478891253471375,
"mask/share_step_conf": 0.10779708623886108,
"num_tokens": 41485066.0,
"reward": 0.7382111549377441,
"reward_std": 0.21607854962348938,
"rewards/accuracy_reward_step": 0.43359375,
"rewards/final_brier_reward_step": 0.691319465637207,
"rewards/format_reward_step": 0.93359375,
"rewards/step_l2_reward": 0.7851028442382812,
"step": 175
},
{
"adv/mean_abs_final_conf": 0.6969413757324219,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7284245491027832,
"adv/std_final_conf": 0.8782188892364502,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353442788124084,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7745104895104895,
"calib/avg_num_step_conf": 4.91796875,
"calib/ece": 0.20312500000000006,
"calib/final_conf_rate": 0.9375,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.4791666666666667,
"calib/gap": 0.3726923076923079,
"calib/mean_conf": 0.676875,
"calib/mu_c": 0.8476923076923079,
"calib/mu_w": 0.475,
"calib/nonempty_final_conf_rate": 0.9375,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.16916666666666672,
"calib/std_conf": 0.3678584207386496,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5631003039513678,
"calib/step_q_c_n": 658.0,
"calib/step_q_gap": 0.17672759180494507,
"calib/step_q_w": 0.3863727121464227,
"calib/step_q_w_n": 601.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2874.0,
"completions/max_terminated_length": 2874.0,
"completions/mean_length": 545.77734375,
"completions/mean_terminated_length": 545.77734375,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.024735065177083015,
"kl": 0.10872650146484375,
"learning_rate": 6.666666666666667e-07,
"loss": -0.1375,
"mask/has_final_conf_rate": 0.9375,
"mask/share_final_conf": 0.03495047986507416,
"mask/share_reasoning": 0.85334712266922,
"mask/share_step_conf": 0.11170239001512527,
"num_tokens": 41728849.0,
"reward": 0.7591984868049622,
"reward_std": 0.2403138130903244,
"rewards/accuracy_reward_step": 0.515625,
"rewards/final_brier_reward_step": 0.7225527167320251,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7958441972732544,
"step": 176
},
{
"adv/mean_abs_final_conf": 0.7366968393325806,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7311983108520508,
"adv/std_final_conf": 0.9075549244880676,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350935816764832,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.753664111873067,
"calib/avg_num_step_conf": 5.3359375,
"calib/ece": 0.15971428571428573,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.4204081632653061,
"calib/gap": 0.34068441575904246,
"calib/mean_conf": 0.6166938775510203,
"calib/mu_c": 0.7710447761194029,
"calib/mu_w": 0.43036036036036046,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.11473469387755103,
"calib/std_conf": 0.37850523432509514,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5550551181102362,
"calib/step_q_c_n": 635.0,
"calib/step_q_gap": 0.17452160237836206,
"calib/step_q_w": 0.38053351573187416,
"calib/step_q_w_n": 731.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3025.0,
"completions/max_terminated_length": 3025.0,
"completions/mean_length": 519.1953125,
"completions/mean_terminated_length": 523.283447265625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.1888,
"grad_norm": 0.0299546979367733,
"kl": 0.10813140869140625,
"learning_rate": 6.388888888888889e-07,
"loss": -0.1086,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.033860474824905396,
"mask/share_reasoning": 0.8479142189025879,
"mask/share_step_conf": 0.11041281372308731,
"num_tokens": 41965595.0,
"reward": 0.7677517533302307,
"reward_std": 0.2208181917667389,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.7314074039459229,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.8040961623191833,
"step": 177
},
{
"adv/mean_abs_final_conf": 0.7266371250152588,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7638938426971436,
"adv/std_final_conf": 0.8988336324691772,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.93440842628479,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.7904185742315238,
"calib/avg_num_step_conf": 5.01171875,
"calib/ece": 0.15554216867469886,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3654618473895582,
"calib/gap": 0.35714061478090253,
"calib/mean_conf": 0.621004016064257,
"calib/mu_c": 0.7787769784172662,
"calib/mu_w": 0.4216363636363637,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.10915662650602416,
"calib/std_conf": 0.3606478385318377,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.587271268057785,
"calib/step_q_c_n": 623.0,
"calib/step_q_gap": 0.21184702563354246,
"calib/step_q_w": 0.3754242424242425,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2608.0,
"completions/max_terminated_length": 2608.0,
"completions/mean_length": 472.04296875,
"completions/mean_terminated_length": 477.6403503417969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.02804921194911003,
"kl": 0.11296844482421875,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0177,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.035195671021938324,
"mask/share_reasoning": 0.8432610630989075,
"mask/share_step_conf": 0.10982454568147659,
"num_tokens": 42192510.0,
"reward": 0.807304859161377,
"reward_std": 0.19166387617588043,
"rewards/accuracy_reward_step": 0.54296875,
"rewards/final_brier_reward_step": 0.7737792730331421,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.840830385684967,
"step": 178
},
{
"adv/mean_abs_final_conf": 0.7112410068511963,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7365227937698364,
"adv/std_final_conf": 0.9087821245193481,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9346781969070435,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7975643382352942,
"calib/avg_num_step_conf": 5.23046875,
"calib/ece": 0.15419354838709684,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.4314516129032258,
"calib/gap": 0.3781144957983193,
"calib/mean_conf": 0.6313709677419355,
"calib/mu_c": 0.8021323529411765,
"calib/mu_w": 0.42401785714285717,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.11858870967741941,
"calib/std_conf": 0.3685692360164819,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5448756906077348,
"calib/step_q_c_n": 724.0,
"calib/step_q_gap": 0.14255048735570225,
"calib/step_q_w": 0.4023252032520325,
"calib/step_q_w_n": 615.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2604.0,
"completions/max_terminated_length": 2604.0,
"completions/mean_length": 516.94921875,
"completions/mean_terminated_length": 516.94921875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.024730678647756577,
"kl": 0.1097259521484375,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0799,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03368227928876877,
"mask/share_reasoning": 0.854263424873352,
"mask/share_step_conf": 0.11205431818962097,
"num_tokens": 42431113.0,
"reward": 0.8051706552505493,
"reward_std": 0.18343767523765564,
"rewards/accuracy_reward_step": 0.53125,
"rewards/final_brier_reward_step": 0.7680890560150146,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8422523736953735,
"step": 179
},
{
"adv/mean_abs_final_conf": 0.7052978277206421,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7558290362358093,
"adv/std_final_conf": 0.8929066061973572,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348551034927368,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.7466370775008535,
"calib/avg_num_step_conf": 5.328125,
"calib/ece": 0.17609756097560975,
"calib/final_conf_rate": 0.9609375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.44715447154471544,
"calib/gap": 0.3255377261864116,
"calib/mean_conf": 0.6577235772357723,
"calib/mu_c": 0.7913793103448276,
"calib/mu_w": 0.465841584158416,
"calib/nonempty_final_conf_rate": 0.9609375,
"calib/nonempty_reasoning_rate": 0.984375,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.12219512195121952,
"calib/std_conf": 0.3662295539577348,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5244253632760898,
"calib/step_q_c_n": 757.0,
"calib/step_q_gap": 0.08030674713111446,
"calib/step_q_w": 0.44411861614497533,
"calib/step_q_w_n": 607.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2937.0,
"completions/max_terminated_length": 2937.0,
"completions/mean_length": 559.89453125,
"completions/mean_terminated_length": 562.0902099609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.192,
"grad_norm": 0.033511292189359665,
"kl": 0.1053466796875,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0235,
"mask/has_final_conf_rate": 0.9609375,
"mask/share_final_conf": 0.030846022069454193,
"mask/share_reasoning": 0.861070454120636,
"mask/share_step_conf": 0.1041773110628128,
"num_tokens": 42678302.0,
"reward": 0.7754788398742676,
"reward_std": 0.19080181419849396,
"rewards/accuracy_reward_step": 0.5703125,
"rewards/final_brier_reward_step": 0.742948055267334,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8080097436904907,
"step": 180
},
{
"adv/mean_abs_final_conf": 0.7309097051620483,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.761759877204895,
"adv/std_final_conf": 0.9095430970191956,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350034594535828,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7781721956886537,
"calib/avg_num_step_conf": 4.828125,
"calib/ece": 0.2001606425702812,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.94921875,
"calib/frac_conf_gt_0.9": 0.42168674698795183,
"calib/gap": 0.32375306570285284,
"calib/mean_conf": 0.6502008032128515,
"calib/mu_c": 0.8153278688524591,
"calib/mu_w": 0.4915748031496063,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.18020080321285148,
"calib/std_conf": 0.3559538910160631,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5650313479623824,
"calib/step_q_c_n": 638.0,
"calib/step_q_gap": 0.13148619746071016,
"calib/step_q_w": 0.43354515050167225,
"calib/step_q_w_n": 598.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2476.0,
"completions/max_terminated_length": 2476.0,
"completions/mean_length": 492.484375,
"completions/mean_terminated_length": 494.41571044921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.026930317282676697,
"kl": 0.12311553955078125,
"learning_rate": 5.277777777777779e-07,
"loss": -0.0894,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03484185039997101,
"mask/share_reasoning": 0.8484460115432739,
"mask/share_step_conf": 0.11280588805675507,
"num_tokens": 42910642.0,
"reward": 0.7667028307914734,
"reward_std": 0.1992136836051941,
"rewards/accuracy_reward_step": 0.4765625,
"rewards/final_brier_reward_step": 0.7242370843887329,
"rewards/format_reward_step": 0.94921875,
"rewards/step_l2_reward": 0.8091685175895691,
"step": 181
},
{
"adv/mean_abs_final_conf": 0.716362476348877,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7423475980758667,
"adv/std_final_conf": 0.8897429704666138,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351845979690552,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.7849022736338251,
"calib/avg_num_step_conf": 5.40625,
"calib/ece": 0.1755465587044534,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.48582995951417,
"calib/gap": 0.3501781677968355,
"calib/mean_conf": 0.6844534412955465,
"calib/mu_c": 0.8389855072463768,
"calib/mu_w": 0.48880733944954124,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.984375,
"calib/pce": 0.1506477732793522,
"calib/std_conf": 0.3539682573120276,
"calib/step_conf_rate": 0.984375,
"calib/step_q_c": 0.545406976744186,
"calib/step_q_c_n": 688.0,
"calib/step_q_gap": 0.14319433306602514,
"calib/step_q_w": 0.4022126436781609,
"calib/step_q_w_n": 696.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2946.0,
"completions/max_terminated_length": 2946.0,
"completions/mean_length": 521.8359375,
"completions/mean_terminated_length": 521.8359375,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.02837144397199154,
"kl": 0.110260009765625,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0358,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.03232816234230995,
"mask/share_reasoning": 0.8547617793083191,
"mask/share_step_conf": 0.11291007697582245,
"num_tokens": 43150392.0,
"reward": 0.7891018390655518,
"reward_std": 0.20950955152511597,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.7541359663009644,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8240677118301392,
"step": 182
},
{
"adv/mean_abs_final_conf": 0.6966094374656677,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7494475841522217,
"adv/std_final_conf": 0.8805187344551086,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.935265839099884,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.711265756302521,
"calib/avg_num_step_conf": 4.8671875,
"calib/ece": 0.23927125506072872,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.44129554655870445,
"calib/gap": 0.24173581932773103,
"calib/mean_conf": 0.6294736842105263,
"calib/mu_c": 0.7459375,
"calib/mu_w": 0.504201680672269,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.17526315789473682,
"calib/std_conf": 0.374413612133966,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5725888324873096,
"calib/step_q_c_n": 591.0,
"calib/step_q_gap": 0.16466516836517214,
"calib/step_q_w": 0.4079236641221375,
"calib/step_q_w_n": 655.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2304.0,
"completions/max_terminated_length": 2304.0,
"completions/mean_length": 516.7265625,
"completions/mean_terminated_length": 516.7265625,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.1952,
"grad_norm": 0.02867904305458069,
"kl": 0.1077117919921875,
"learning_rate": 4.7222222222222226e-07,
"loss": -0.1072,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.0338304303586483,
"mask/share_reasoning": 0.8622312545776367,
"mask/share_step_conf": 0.1039382666349411,
"num_tokens": 43389354.0,
"reward": 0.7498453259468079,
"reward_std": 0.2144506722688675,
"rewards/accuracy_reward_step": 0.5,
"rewards/final_brier_reward_step": 0.6893109679222107,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.8103797435760498,
"step": 183
},
{
"adv/mean_abs_final_conf": 0.722597599029541,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7299163937568665,
"adv/std_final_conf": 0.9212670922279358,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351829290390015,
"calib/answer_extract_rate": 0.98046875,
"calib/auroc": 0.7574013157894738,
"calib/avg_num_step_conf": 5.5234375,
"calib/ece": 0.1932142857142856,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.5357142857142857,
"calib/gap": 0.31155526315789495,
"calib/mean_conf": 0.7247222222222223,
"calib/mu_c": 0.8483552631578949,
"calib/mu_w": 0.5367999999999999,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.15738095238095226,
"calib/std_conf": 0.3386192819046213,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.5488351920693928,
"calib/step_q_c_n": 807.0,
"calib/step_q_gap": 0.10737555450761355,
"calib/step_q_w": 0.44145963756177925,
"calib/step_q_w_n": 607.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2298.0,
"completions/max_terminated_length": 2298.0,
"completions/mean_length": 507.171875,
"completions/mean_terminated_length": 507.171875,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.025500323623418808,
"kl": 0.1114654541015625,
"learning_rate": 4.444444444444445e-07,
"loss": -0.052,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.033292390406131744,
"mask/share_reasoning": 0.8502817749977112,
"mask/share_step_conf": 0.11642585694789886,
"num_tokens": 43624470.0,
"reward": 0.794865608215332,
"reward_std": 0.21812523901462555,
"rewards/accuracy_reward_step": 0.59375,
"rewards/final_brier_reward_step": 0.756519079208374,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8332120776176453,
"step": 184
},
{
"adv/mean_abs_final_conf": 0.6467674970626831,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7386155724525452,
"adv/std_final_conf": 0.8718067407608032,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341294169425964,
"calib/answer_extract_rate": 0.9375,
"calib/auroc": 0.8350427350427351,
"calib/avg_num_step_conf": 5.50390625,
"calib/ece": 0.1701673640167365,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.5439330543933054,
"calib/gap": 0.44017663817663805,
"calib/mean_conf": 0.7178661087866108,
"calib/mu_c": 0.9094074074074073,
"calib/mu_w": 0.4692307692307693,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1615899581589959,
"calib/std_conf": 0.36194160117195295,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.593742774566474,
"calib/step_q_c_n": 692.0,
"calib/step_q_gap": 0.22329647052184354,
"calib/step_q_w": 0.3704463040446304,
"calib/step_q_w_n": 717.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3029.0,
"completions/max_terminated_length": 3029.0,
"completions/mean_length": 556.5,
"completions/mean_terminated_length": 558.682373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.027720289304852486,
"kl": 0.09789276123046875,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0947,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.03240993991494179,
"mask/share_reasoning": 0.8568481802940369,
"mask/share_step_conf": 0.10683561861515045,
"num_tokens": 43873854.0,
"reward": 0.7714694738388062,
"reward_std": 0.19280867278575897,
"rewards/accuracy_reward_step": 0.52734375,
"rewards/final_brier_reward_step": 0.7566031217575073,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.786335825920105,
"step": 185
},
{
"adv/mean_abs_final_conf": 0.6654640436172485,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7512376308441162,
"adv/std_final_conf": 0.8803311586380005,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341305494308472,
"calib/answer_extract_rate": 0.984375,
"calib/auroc": 0.7729294231388978,
"calib/avg_num_step_conf": 5.51953125,
"calib/ece": 0.17456349206349214,
"calib/final_conf_rate": 0.984375,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.48412698412698413,
"calib/gap": 0.3772716725749276,
"calib/mean_conf": 0.6609126984126985,
"calib/mu_c": 0.8211034482758621,
"calib/mu_w": 0.4438317757009345,
"calib/nonempty_final_conf_rate": 0.984375,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.1300396825396826,
"calib/std_conf": 0.3737470607166107,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5759733333333333,
"calib/step_q_c_n": 750.0,
"calib/step_q_gap": 0.16490244343891408,
"calib/step_q_w": 0.41107088989441926,
"calib/step_q_w_n": 663.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2543.0,
"completions/max_terminated_length": 2543.0,
"completions/mean_length": 513.26953125,
"completions/mean_terminated_length": 513.26953125,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.1984,
"grad_norm": 0.03017481043934822,
"kl": 0.1173858642578125,
"learning_rate": 3.8888888888888895e-07,
"loss": -0.0154,
"mask/has_final_conf_rate": 0.984375,
"mask/share_final_conf": 0.03294532373547554,
"mask/share_reasoning": 0.8512404561042786,
"mask/share_step_conf": 0.115814208984375,
"num_tokens": 44110291.0,
"reward": 0.8168442249298096,
"reward_std": 0.16611357033252716,
"rewards/accuracy_reward_step": 0.56640625,
"rewards/final_brier_reward_step": 0.7806402444839478,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8530482053756714,
"step": 186
},
{
"adv/mean_abs_final_conf": 0.7587389945983887,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7640891075134277,
"adv/std_final_conf": 0.9130141139030457,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348838329315186,
"calib/answer_extract_rate": 0.9296875,
"calib/auroc": 0.7465395480225988,
"calib/avg_num_step_conf": 5.296875,
"calib/ece": 0.24495798319327733,
"calib/final_conf_rate": 0.9296875,
"calib/format_rate": 0.921875,
"calib/frac_conf_gt_0.9": 0.5042016806722689,
"calib/gap": 0.29432062146892646,
"calib/mean_conf": 0.7091596638655461,
"calib/mu_c": 0.8550833333333333,
"calib/mu_w": 0.5607627118644068,
"calib/nonempty_final_conf_rate": 0.9296875,
"calib/nonempty_reasoning_rate": 0.9609375,
"calib/nonempty_step_conf_rate": 0.953125,
"calib/pce": 0.22495798319327734,
"calib/std_conf": 0.33363437903968757,
"calib/step_conf_rate": 0.953125,
"calib/step_q_c": 0.5535940099833612,
"calib/step_q_c_n": 601.0,
"calib/step_q_gap": 0.1172231490562089,
"calib/step_q_w": 0.43637086092715227,
"calib/step_q_w_n": 755.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3035.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 592.921875,
"completions/mean_terminated_length": 595.2470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.04078133404254913,
"kl": 0.12639617919921875,
"learning_rate": 3.611111111111111e-07,
"loss": -0.0517,
"mask/has_final_conf_rate": 0.9296875,
"mask/share_final_conf": 0.029552815482020378,
"mask/share_reasoning": 0.865035891532898,
"mask/share_step_conf": 0.10150502622127533,
"num_tokens": 44363623.0,
"reward": 0.7370548248291016,
"reward_std": 0.24327854812145233,
"rewards/accuracy_reward_step": 0.46875,
"rewards/final_brier_reward_step": 0.6864898204803467,
"rewards/format_reward_step": 0.921875,
"rewards/step_l2_reward": 0.7876197099685669,
"step": 187
},
{
"adv/mean_abs_final_conf": 0.741357147693634,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7504744529724121,
"adv/std_final_conf": 0.8928669691085815,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348607063293457,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.7147405175955087,
"calib/avg_num_step_conf": 5.5390625,
"calib/ece": 0.23893004115226332,
"calib/final_conf_rate": 0.94921875,
"calib/format_rate": 0.9453125,
"calib/frac_conf_gt_0.9": 0.4732510288065844,
"calib/gap": 0.23204162672874173,
"calib/mean_conf": 0.6762139917695473,
"calib/mu_c": 0.7802985074626867,
"calib/mu_w": 0.548256880733945,
"calib/nonempty_final_conf_rate": 0.94921875,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96875,
"calib/pce": 0.1818518518518518,
"calib/std_conf": 0.3621910738279023,
"calib/step_conf_rate": 0.96875,
"calib/step_q_c": 0.5239970501474926,
"calib/step_q_c_n": 678.0,
"calib/step_q_gap": 0.09524029339073592,
"calib/step_q_w": 0.42875675675675673,
"calib/step_q_w_n": 740.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 581.21484375,
"completions/mean_terminated_length": 583.494140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.025433626025915146,
"kl": 0.0932159423828125,
"learning_rate": 3.3333333333333335e-07,
"loss": -0.0809,
"mask/has_final_conf_rate": 0.94921875,
"mask/share_final_conf": 0.032101910561323166,
"mask/share_reasoning": 0.8519271612167358,
"mask/share_step_conf": 0.11206468939781189,
"num_tokens": 44616486.0,
"reward": 0.7399386167526245,
"reward_std": 0.21706843376159668,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.6801941394805908,
"rewards/format_reward_step": 0.9453125,
"rewards/step_l2_reward": 0.7996830940246582,
"step": 188
},
{
"adv/mean_abs_final_conf": 0.7388796210289001,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7620296478271484,
"adv/std_final_conf": 0.9037927985191345,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347342848777771,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7080489763147331,
"calib/avg_num_step_conf": 5.09375,
"calib/ece": 0.21684210526315784,
"calib/final_conf_rate": 0.96484375,
"calib/format_rate": 0.96484375,
"calib/frac_conf_gt_0.9": 0.39271255060728744,
"calib/gap": 0.24630603505954773,
"calib/mean_conf": 0.5980566801619435,
"calib/mu_c": 0.703758865248227,
"calib/mu_w": 0.4574528301886792,
"calib/nonempty_final_conf_rate": 0.96484375,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.98046875,
"calib/pce": 0.12202429149797565,
"calib/std_conf": 0.3709767565122908,
"calib/step_conf_rate": 0.98046875,
"calib/step_q_c": 0.4952091254752851,
"calib/step_q_c_n": 789.0,
"calib/step_q_gap": 0.0966848536306249,
"calib/step_q_w": 0.3985242718446602,
"calib/step_q_w_n": 515.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2199.0,
"completions/max_terminated_length": 2199.0,
"completions/mean_length": 493.4765625,
"completions/mean_terminated_length": 493.4765625,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.2016,
"grad_norm": 0.024370839819312096,
"kl": 0.1257476806640625,
"learning_rate": 3.055555555555556e-07,
"loss": -0.085,
"mask/has_final_conf_rate": 0.96484375,
"mask/share_final_conf": 0.035693950951099396,
"mask/share_reasoning": 0.8530647158622742,
"mask/share_step_conf": 0.11124132573604584,
"num_tokens": 44850584.0,
"reward": 0.7606855630874634,
"reward_std": 0.18621540069580078,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.7114140391349792,
"rewards/format_reward_step": 0.96484375,
"rewards/step_l2_reward": 0.8099571466445923,
"step": 189
},
{
"adv/mean_abs_final_conf": 0.7214258313179016,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7243682742118835,
"adv/std_final_conf": 0.9232921004295349,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349976181983948,
"calib/answer_extract_rate": 0.9765625,
"calib/auroc": 0.8055611055611055,
"calib/avg_num_step_conf": 5.5625,
"calib/ece": 0.13370967741935494,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.39919354838709675,
"calib/gap": 0.38047952047952055,
"calib/mean_conf": 0.6365322580645162,
"calib/mu_c": 0.7976223776223776,
"calib/mu_w": 0.4171428571428571,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.98828125,
"calib/nonempty_step_conf_rate": 0.97265625,
"calib/pce": 0.09681451612903236,
"calib/std_conf": 0.3574552630588191,
"calib/step_conf_rate": 0.97265625,
"calib/step_q_c": 0.5187714987714988,
"calib/step_q_c_n": 814.0,
"calib/step_q_gap": 0.14439936762395778,
"calib/step_q_w": 0.374372131147541,
"calib/step_q_w_n": 610.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2608.0,
"completions/max_terminated_length": 2608.0,
"completions/mean_length": 571.48828125,
"completions/mean_terminated_length": 571.48828125,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.03225598856806755,
"kl": 0.09828948974609375,
"learning_rate": 2.7777777777777776e-07,
"loss": -0.0619,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.030426179990172386,
"mask/share_reasoning": 0.8637335300445557,
"mask/share_step_conf": 0.10584026575088501,
"num_tokens": 45102493.0,
"reward": 0.8016316294670105,
"reward_std": 0.20212367177009583,
"rewards/accuracy_reward_step": 0.55859375,
"rewards/final_brier_reward_step": 0.7732390761375427,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.830024242401123,
"step": 190
},
{
"adv/mean_abs_final_conf": 0.6836140751838684,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7573249340057373,
"adv/std_final_conf": 0.8904557228088379,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9353086352348328,
"calib/answer_extract_rate": 0.96875,
"calib/auroc": 0.7221316818774447,
"calib/avg_num_step_conf": 5.92578125,
"calib/ece": 0.2806854838709677,
"calib/final_conf_rate": 0.96875,
"calib/format_rate": 0.9609375,
"calib/frac_conf_gt_0.9": 0.5524193548387096,
"calib/gap": 0.26793611473272494,
"calib/mean_conf": 0.7280241935483871,
"calib/mu_c": 0.8684745762711864,
"calib/mu_w": 0.6005384615384615,
"calib/nonempty_final_conf_rate": 0.96875,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.98828125,
"calib/pce": 0.2664516129032258,
"calib/std_conf": 0.3429928280005771,
"calib/step_conf_rate": 0.98828125,
"calib/step_q_c": 0.5900663349917082,
"calib/step_q_c_n": 603.0,
"calib/step_q_gap": 0.17962869823022032,
"calib/step_q_w": 0.4104376367614879,
"calib/step_q_w_n": 914.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 533.52734375,
"completions/mean_terminated_length": 533.52734375,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.03718110918998718,
"kl": 0.1095123291015625,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0399,
"mask/has_final_conf_rate": 0.96875,
"mask/share_final_conf": 0.03474542871117592,
"mask/share_reasoning": 0.8406577706336975,
"mask/share_step_conf": 0.12459678947925568,
"num_tokens": 45343244.0,
"reward": 0.7328799962997437,
"reward_std": 0.1909710317850113,
"rewards/accuracy_reward_step": 0.4609375,
"rewards/final_brier_reward_step": 0.6795296669006348,
"rewards/format_reward_step": 0.9609375,
"rewards/step_l2_reward": 0.7862304449081421,
"step": 191
},
{
"adv/mean_abs_final_conf": 0.7038275003433228,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7256333827972412,
"adv/std_final_conf": 0.8924391269683838,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348781704902649,
"calib/answer_extract_rate": 0.94140625,
"calib/auroc": 0.79156223893066,
"calib/avg_num_step_conf": 4.93359375,
"calib/ece": 0.16763485477178416,
"calib/final_conf_rate": 0.94140625,
"calib/format_rate": 0.9296875,
"calib/frac_conf_gt_0.9": 0.44398340248962653,
"calib/gap": 0.3649143692564744,
"calib/mean_conf": 0.6344398340248963,
"calib/mu_c": 0.79796992481203,
"calib/mu_w": 0.4330555555555556,
"calib/nonempty_final_conf_rate": 0.94140625,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.12510373443983397,
"calib/std_conf": 0.37149886703162577,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5448253968253969,
"calib/step_q_c_n": 630.0,
"calib/step_q_gap": 0.15837989919506512,
"calib/step_q_w": 0.38644549763033176,
"calib/step_q_w_n": 633.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2567.0,
"completions/max_terminated_length": 2567.0,
"completions/mean_length": 531.1328125,
"completions/mean_terminated_length": 533.2156982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.2048,
"grad_norm": 0.030240915715694427,
"kl": 0.1117401123046875,
"learning_rate": 2.2222222222222224e-07,
"loss": -0.1492,
"mask/has_final_conf_rate": 0.94140625,
"mask/share_final_conf": 0.034587934613227844,
"mask/share_reasoning": 0.852296769618988,
"mask/share_step_conf": 0.10920904576778412,
"num_tokens": 45584190.0,
"reward": 0.7673456072807312,
"reward_std": 0.2374992072582245,
"rewards/accuracy_reward_step": 0.5234375,
"rewards/final_brier_reward_step": 0.735599160194397,
"rewards/format_reward_step": 0.9296875,
"rewards/step_l2_reward": 0.7990920543670654,
"step": 192
},
{
"adv/mean_abs_final_conf": 0.7664552927017212,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7637758851051331,
"adv/std_final_conf": 0.9189733266830444,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9349616169929504,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.775987861570248,
"calib/avg_num_step_conf": 5.12109375,
"calib/ece": 0.2009638554216867,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.3855421686746988,
"calib/gap": 0.3164585485537191,
"calib/mean_conf": 0.635421686746988,
"calib/mu_c": 0.7980991735537191,
"calib/mu_w": 0.481640625,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.1752208835341365,
"calib/std_conf": 0.35633442125431675,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5100494233937397,
"calib/step_q_c_n": 607.0,
"calib/step_q_gap": 0.09916874157555788,
"calib/step_q_w": 0.4108806818181818,
"calib/step_q_w_n": 704.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2898.0,
"completions/max_terminated_length": 2898.0,
"completions/mean_length": 549.03515625,
"completions/mean_terminated_length": 549.03515625,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.04683626815676689,
"kl": 0.1035919189453125,
"learning_rate": 1.9444444444444447e-07,
"loss": -0.1273,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.032026469707489014,
"mask/share_reasoning": 0.8625136613845825,
"mask/share_step_conf": 0.10545986890792847,
"num_tokens": 45830455.0,
"reward": 0.7765910625457764,
"reward_std": 0.20513297617435455,
"rewards/accuracy_reward_step": 0.47265625,
"rewards/final_brier_reward_step": 0.7382304668426514,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8149516582489014,
"step": 193
},
{
"adv/mean_abs_final_conf": 0.7188024520874023,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7388456463813782,
"adv/std_final_conf": 0.8925259113311768,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9348104000091553,
"calib/answer_extract_rate": 0.96484375,
"calib/auroc": 0.8354273044880408,
"calib/avg_num_step_conf": 4.88671875,
"calib/ece": 0.2031967213114754,
"calib/final_conf_rate": 0.953125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.48770491803278687,
"calib/gap": 0.4008196721311476,
"calib/mean_conf": 0.666311475409836,
"calib/mu_c": 0.8667213114754099,
"calib/mu_w": 0.46590163934426226,
"calib/nonempty_final_conf_rate": 0.953125,
"calib/nonempty_reasoning_rate": 0.9765625,
"calib/nonempty_step_conf_rate": 0.9609375,
"calib/pce": 0.18475409836065573,
"calib/std_conf": 0.36803911236303505,
"calib/step_conf_rate": 0.9609375,
"calib/step_q_c": 0.5610172143974961,
"calib/step_q_c_n": 639.0,
"calib/step_q_gap": 0.1487296326981497,
"calib/step_q_w": 0.4122875816993464,
"calib/step_q_w_n": 612.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2607.0,
"completions/max_terminated_length": 2607.0,
"completions/mean_length": 498.07421875,
"completions/mean_terminated_length": 500.0274658203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.024641873314976692,
"kl": 0.105682373046875,
"learning_rate": 1.6666666666666668e-07,
"loss": -0.1451,
"mask/has_final_conf_rate": 0.953125,
"mask/share_final_conf": 0.034925445914268494,
"mask/share_reasoning": 0.8536133766174316,
"mask/share_step_conf": 0.10755492746829987,
"num_tokens": 46063906.0,
"reward": 0.7782323360443115,
"reward_std": 0.2255384773015976,
"rewards/accuracy_reward_step": 0.48046875,
"rewards/final_brier_reward_step": 0.7424625158309937,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.814002275466919,
"step": 194
},
{
"adv/mean_abs_final_conf": 0.7369129657745361,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7271870374679565,
"adv/std_final_conf": 0.9174585938453674,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9350939989089966,
"calib/answer_extract_rate": 0.93359375,
"calib/auroc": 0.7756257030371204,
"calib/avg_num_step_conf": 5.30078125,
"calib/ece": 0.16782426778242684,
"calib/final_conf_rate": 0.93359375,
"calib/format_rate": 0.92578125,
"calib/frac_conf_gt_0.9": 0.38493723849372385,
"calib/gap": 0.364064257592801,
"calib/mean_conf": 0.6210460251046024,
"calib/mu_c": 0.7916535433070867,
"calib/mu_w": 0.4275892857142857,
"calib/nonempty_final_conf_rate": 0.93359375,
"calib/nonempty_reasoning_rate": 0.97265625,
"calib/nonempty_step_conf_rate": 0.96484375,
"calib/pce": 0.12874476987447703,
"calib/std_conf": 0.36755916891238644,
"calib/step_conf_rate": 0.96484375,
"calib/step_q_c": 0.5499426111908179,
"calib/step_q_c_n": 697.0,
"calib/step_q_gap": 0.16221533846354513,
"calib/step_q_w": 0.38772727272727275,
"calib/step_q_w_n": 660.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2461.0,
"completions/max_terminated_length": 2461.0,
"completions/mean_length": 534.41015625,
"completions/mean_terminated_length": 540.7470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.208,
"grad_norm": 0.024135705083608627,
"kl": 0.109375,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.1971,
"mask/has_final_conf_rate": 0.93359375,
"mask/share_final_conf": 0.03207322955131531,
"mask/share_reasoning": 0.8431376814842224,
"mask/share_step_conf": 0.11307032406330109,
"num_tokens": 46306699.0,
"reward": 0.757156252861023,
"reward_std": 0.22981423139572144,
"rewards/accuracy_reward_step": 0.49609375,
"rewards/final_brier_reward_step": 0.7289426326751709,
"rewards/format_reward_step": 0.92578125,
"rewards/step_l2_reward": 0.7853699326515198,
"step": 195
},
{
"adv/mean_abs_final_conf": 0.634067177772522,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7509913444519043,
"adv/std_final_conf": 0.8568524718284607,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9345593452453613,
"calib/answer_extract_rate": 0.98828125,
"calib/auroc": 0.7871885225270576,
"calib/avg_num_step_conf": 4.859375,
"calib/ece": 0.1885770750988143,
"calib/final_conf_rate": 0.98828125,
"calib/format_rate": 0.984375,
"calib/frac_conf_gt_0.9": 0.47035573122529645,
"calib/gap": 0.33918701233324944,
"calib/mean_conf": 0.708498023715415,
"calib/mu_c": 0.8640145985401461,
"calib/mu_w": 0.5248275862068966,
"calib/nonempty_final_conf_rate": 0.98828125,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.17778656126482223,
"calib/std_conf": 0.3414930554390387,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5664759036144578,
"calib/step_q_c_n": 664.0,
"calib/step_q_gap": 0.10759659326963023,
"calib/step_q_w": 0.45887931034482754,
"calib/step_q_w_n": 580.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2930.0,
"completions/max_terminated_length": 2930.0,
"completions/mean_length": 445.6484375,
"completions/mean_terminated_length": 445.6484375,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.025791611522436142,
"kl": 0.1186065673828125,
"learning_rate": 1.1111111111111112e-07,
"loss": -0.0162,
"mask/has_final_conf_rate": 0.98828125,
"mask/share_final_conf": 0.037724819034338,
"mask/share_reasoning": 0.8455040454864502,
"mask/share_step_conf": 0.11677109450101852,
"num_tokens": 46523329.0,
"reward": 0.7994219064712524,
"reward_std": 0.15122339129447937,
"rewards/accuracy_reward_step": 0.53515625,
"rewards/final_brier_reward_step": 0.7627730369567871,
"rewards/format_reward_step": 0.984375,
"rewards/step_l2_reward": 0.8360706567764282,
"step": 196
},
{
"adv/mean_abs_final_conf": 0.7248457670211792,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7576488256454468,
"adv/std_final_conf": 0.9057278633117676,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9341248869895935,
"calib/answer_extract_rate": 0.97265625,
"calib/auroc": 0.8298182998053212,
"calib/avg_num_step_conf": 5.875,
"calib/ece": 0.16755020080321287,
"calib/final_conf_rate": 0.97265625,
"calib/format_rate": 0.97265625,
"calib/frac_conf_gt_0.9": 0.37751004016064255,
"calib/gap": 0.39027644386761834,
"calib/mean_conf": 0.6077108433734939,
"calib/mu_c": 0.8177391304347825,
"calib/mu_w": 0.4274626865671642,
"calib/nonempty_final_conf_rate": 0.97265625,
"calib/nonempty_reasoning_rate": 0.99609375,
"calib/nonempty_step_conf_rate": 0.99609375,
"calib/pce": 0.15670682730923696,
"calib/std_conf": 0.36246808573389455,
"calib/step_conf_rate": 0.99609375,
"calib/step_q_c": 0.5408744038155803,
"calib/step_q_c_n": 629.0,
"calib/step_q_gap": 0.14510297524415178,
"calib/step_q_w": 0.3957714285714285,
"calib/step_q_w_n": 875.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3004.0,
"completions/max_terminated_length": 3004.0,
"completions/mean_length": 539.18359375,
"completions/mean_terminated_length": 541.298095703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.029260681942105293,
"kl": 0.1084442138671875,
"learning_rate": 8.333333333333334e-08,
"loss": -0.0653,
"mask/has_final_conf_rate": 0.97265625,
"mask/share_final_conf": 0.03204625844955444,
"mask/share_reasoning": 0.8442831635475159,
"mask/share_step_conf": 0.11976435035467148,
"num_tokens": 46766416.0,
"reward": 0.806782603263855,
"reward_std": 0.16487516462802887,
"rewards/accuracy_reward_step": 0.44921875,
"rewards/final_brier_reward_step": 0.7711203098297119,
"rewards/format_reward_step": 0.97265625,
"rewards/step_l2_reward": 0.8424450159072876,
"step": 197
},
{
"adv/mean_abs_final_conf": 0.7016639709472656,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7586361765861511,
"adv/std_final_conf": 0.8786138296127319,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9339083433151245,
"calib/answer_extract_rate": 0.95703125,
"calib/auroc": 0.8085466612488149,
"calib/avg_num_step_conf": 5.625,
"calib/ece": 0.13453061224489798,
"calib/final_conf_rate": 0.95703125,
"calib/format_rate": 0.95703125,
"calib/frac_conf_gt_0.9": 0.4489795918367347,
"calib/gap": 0.3924190708384126,
"calib/mean_conf": 0.6554285714285714,
"calib/mu_c": 0.8268115942028986,
"calib/mu_w": 0.434392523364486,
"calib/nonempty_final_conf_rate": 0.95703125,
"calib/nonempty_reasoning_rate": 0.9921875,
"calib/nonempty_step_conf_rate": 0.9921875,
"calib/pce": 0.11334693877551025,
"calib/std_conf": 0.3682018080191506,
"calib/step_conf_rate": 0.9921875,
"calib/step_q_c": 0.5540152477763659,
"calib/step_q_c_n": 787.0,
"calib/step_q_gap": 0.17266762143639658,
"calib/step_q_w": 0.3813476263399693,
"calib/step_q_w_n": 653.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 495.03125,
"completions/mean_terminated_length": 500.9012145996094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.2112,
"grad_norm": 0.029695816338062286,
"kl": 0.1151885986328125,
"learning_rate": 5.555555555555556e-08,
"loss": -0.1509,
"mask/has_final_conf_rate": 0.95703125,
"mask/share_final_conf": 0.034423746168613434,
"mask/share_reasoning": 0.8307321667671204,
"mask/share_step_conf": 0.12312532961368561,
"num_tokens": 46998528.0,
"reward": 0.8024085760116577,
"reward_std": 0.18739835917949677,
"rewards/accuracy_reward_step": 0.5390625,
"rewards/final_brier_reward_step": 0.768500030040741,
"rewards/format_reward_step": 0.95703125,
"rewards/step_l2_reward": 0.8363170623779297,
"step": 198
},
{
"adv/mean_abs_final_conf": 0.7539445161819458,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7357039451599121,
"adv/std_final_conf": 0.9093256592750549,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9351595044136047,
"calib/answer_extract_rate": 0.94921875,
"calib/auroc": 0.6866793062284952,
"calib/avg_num_step_conf": 5.3125,
"calib/ece": 0.23665289256198357,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.4132231404958678,
"calib/gap": 0.2108552770170634,
"calib/mean_conf": 0.6543388429752066,
"calib/mu_c": 0.7423404255319148,
"calib/mu_w": 0.5314851485148514,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.15417355371900834,
"calib/std_conf": 0.35539376494340014,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.51625,
"calib/step_q_c_n": 712.0,
"calib/step_q_gap": 0.10226851851851854,
"calib/step_q_w": 0.41398148148148145,
"calib/step_q_w_n": 648.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2683.0,
"completions/max_terminated_length": 2683.0,
"completions/mean_length": 559.0546875,
"completions/mean_terminated_length": 565.683837890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.027883553877472878,
"kl": 0.1039276123046875,
"learning_rate": 2.777777777777778e-08,
"loss": -0.1335,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03248848393559456,
"mask/share_reasoning": 0.845879316329956,
"mask/share_step_conf": 0.1099134162068367,
"num_tokens": 47245846.0,
"reward": 0.7473338842391968,
"reward_std": 0.2364819198846817,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.6842191219329834,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8104487061500549,
"step": 199
},
{
"adv/mean_abs_final_conf": 0.6555727124214172,
"adv/mean_abs_reasoning": 0.0,
"adv/mean_abs_step_conf": 0.7696542143821716,
"adv/std_final_conf": 0.8538159728050232,
"adv/std_reasoning": 0.0,
"adv/std_step_conf": 0.9347708821296692,
"calib/answer_extract_rate": 0.953125,
"calib/auroc": 0.9080633802816902,
"calib/avg_num_step_conf": 4.9765625,
"calib/ece": 0.1094214876033059,
"calib/final_conf_rate": 0.9453125,
"calib/format_rate": 0.94140625,
"calib/frac_conf_gt_0.9": 0.5247933884297521,
"calib/gap": 0.5623704225352111,
"calib/mean_conf": 0.6676859504132231,
"calib/mu_c": 0.9000704225352111,
"calib/mu_w": 0.33770000000000006,
"calib/nonempty_final_conf_rate": 0.9453125,
"calib/nonempty_reasoning_rate": 0.98046875,
"calib/nonempty_step_conf_rate": 0.9765625,
"calib/pce": 0.09516528925619847,
"calib/std_conf": 0.3811872716705017,
"calib/step_conf_rate": 0.9765625,
"calib/step_q_c": 0.5982966226138032,
"calib/step_q_c_n": 681.0,
"calib/step_q_gap": 0.24617183340638332,
"calib/step_q_w": 0.3521247892074199,
"calib/step_q_w_n": 593.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2614.0,
"completions/max_terminated_length": 2614.0,
"completions/mean_length": 556.3125,
"completions/mean_terminated_length": 560.6929321289062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.0319523885846138,
"kl": 0.0972442626953125,
"learning_rate": 0.0,
"loss": -0.0845,
"mask/has_final_conf_rate": 0.9453125,
"mask/share_final_conf": 0.03294922411441803,
"mask/share_reasoning": 0.8577835559844971,
"mask/share_step_conf": 0.10145474225282669,
"num_tokens": 47496310.0,
"reward": 0.8319915533065796,
"reward_std": 0.17969706654548645,
"rewards/accuracy_reward_step": 0.5546875,
"rewards/final_brier_reward_step": 0.8264539241790771,
"rewards/format_reward_step": 0.94140625,
"rewards/step_l2_reward": 0.8375290632247925,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.09220705164130777,
"train_runtime": 12915.8932,
"train_samples_per_second": 3.964,
"train_steps_per_second": 0.015
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 47496310,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}