Files
Qwen2.5-3B-MATH-GRPO/checkpoint-1170/trainer_state.json
ModelHub XC e8adfe85ab 初始化项目,由ModelHub XC社区提供模型
Model: cheongmyeong17/Qwen2.5-3B-MATH-GRPO
Source: Original Platform
2026-05-26 01:59:17 +08:00

25775 lines
898 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998932764140875,
"eval_steps": 500,
"global_step": 1170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.6185625195503235,
"advantages/var": 0.3826195905924443,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.546875,
"epoch": 0.004268943436499467,
"grad_norm": 0.2276165733924732,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 183035.0,
"reward": 0.5859375,
"reward_std": 0.17742186784744263,
"rewards/drgrpo_math_reward/mean": 0.5859375,
"rewards/drgrpo_math_reward/std": 0.4935242533683777,
"step": 1
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.2525086942753754e-09,
"advantages/std": 0.5726795196533203,
"advantages/var": 0.3279618322303577,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.008537886872998933,
"grad_norm": 0.226357800614887,
"learning_rate": 9.999982128386562e-07,
"loss": -0.0,
"num_tokens": 361873.0,
"reward": 0.56640625,
"reward_std": 0.1553223431110382,
"rewards/drgrpo_math_reward/mean": 0.56640625,
"rewards/drgrpo_math_reward/std": 0.4965413510799408,
"step": 2
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 7.4215683647132815e-09,
"advantages/std": 0.5960710644721985,
"advantages/var": 0.3553007139010198,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 0.012806830309498399,
"grad_norm": 0.19563640777871955,
"learning_rate": 9.999928513674003e-07,
"loss": 0.0,
"num_tokens": 552736.0,
"reward": 0.5390625,
"reward_std": 0.17609265446662903,
"rewards/drgrpo_math_reward/mean": 0.5390625,
"rewards/drgrpo_math_reward/std": 0.4994482398033142,
"step": 3
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.906189388529494e-09,
"advantages/std": 0.5960556864738464,
"advantages/var": 0.3552823813778083,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 0.017075773745997867,
"grad_norm": 0.22884223901736953,
"learning_rate": 9.999839156245597e-07,
"loss": 0.0,
"num_tokens": 715125.0,
"reward": 0.63671875,
"reward_std": 0.1573006510734558,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 4
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.520896926640372e-09,
"advantages/std": 0.6612821817398071,
"advantages/var": 0.4372941238865593,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.021344717182497332,
"grad_norm": 0.22527385543570674,
"learning_rate": 9.999714056740128e-07,
"loss": -0.0,
"num_tokens": 889019.0,
"reward": 0.7265625,
"reward_std": 0.21778054535388947,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 5
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.171810413586495e-09,
"advantages/std": 0.5960792899131775,
"advantages/var": 0.3553105198633979,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.390625,
"epoch": 0.025613660618996798,
"grad_norm": 0.2126144087440947,
"learning_rate": 9.99955321605189e-07,
"loss": 0.0,
"num_tokens": 1074157.0,
"reward": 0.5,
"reward_std": 0.18634238839149475,
"rewards/drgrpo_math_reward/mean": 0.5,
"rewards/drgrpo_math_reward/std": 0.5009794235229492,
"step": 6
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.065588475315301e-09,
"advantages/std": 0.5726861953735352,
"advantages/var": 0.3279694783714149,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 0.029882604055496264,
"grad_norm": 0.1897445781841252,
"learning_rate": 9.999356635330673e-07,
"loss": -0.0,
"num_tokens": 1234275.0,
"reward": 0.7265625,
"reward_std": 0.1626875400543213,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 7
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 4.5771540403460145e-09,
"advantages/std": 0.6612839102745056,
"advantages/var": 0.4372964099879404,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 0.03415154749199573,
"grad_norm": 0.23483356980828468,
"learning_rate": 9.999124315981764e-07,
"loss": 0.0,
"num_tokens": 1398228.0,
"reward": 0.671875,
"reward_std": 0.2210792601108551,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 8
},
{
"advantages/mean": 5.122274160385132e-09,
"advantages/snr": 7.99997790566197e-09,
"advantages/std": 0.6402860283851624,
"advantages/var": 0.40996619814524493,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.296875,
"epoch": 0.0384204909284952,
"grad_norm": 0.3110548918586148,
"learning_rate": 9.998856259665933e-07,
"loss": -0.0,
"num_tokens": 1578412.0,
"reward": 0.5625,
"reward_std": 0.20726242661476135,
"rewards/drgrpo_math_reward/mean": 0.5625,
"rewards/drgrpo_math_reward/std": 0.49705013632774353,
"step": 9
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 8.917455869309804e-09,
"advantages/std": 0.5483002662658691,
"advantages/var": 0.300633181987223,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 0.042689434364994665,
"grad_norm": 1.5465370680042207,
"learning_rate": 9.99855246829942e-07,
"loss": 0.0,
"num_tokens": 1733286.0,
"reward": 0.76171875,
"reward_std": 0.1437433660030365,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 10
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 4.516702186066849e-09,
"advantages/std": 0.6185857653617859,
"advantages/var": 0.3826483491082264,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.4375,
"epoch": 0.04695837780149413,
"grad_norm": 0.2519428429352001,
"learning_rate": 9.998212944053918e-07,
"loss": 0.0,
"num_tokens": 1918742.0,
"reward": 0.51171875,
"reward_std": 0.20752444863319397,
"rewards/drgrpo_math_reward/mean": 0.51171875,
"rewards/drgrpo_math_reward/std": 0.5008418560028076,
"step": 11
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 6.337529925466492e-09,
"advantages/std": 0.6612910032272339,
"advantages/var": 0.43730579094928146,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.051227321237993596,
"grad_norm": 0.2167273292177543,
"learning_rate": 9.997837689356569e-07,
"loss": 0.0,
"num_tokens": 2090431.0,
"reward": 0.65234375,
"reward_std": 0.23079858720302582,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 12
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 1.0163927816352007e-08,
"advantages/std": 0.572688639163971,
"advantages/var": 0.3279722774274809,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 0.05549626467449306,
"grad_norm": 0.2485293884209483,
"learning_rate": 9.997426706889933e-07,
"loss": 0.0,
"num_tokens": 2244232.0,
"reward": 0.64453125,
"reward_std": 0.16663289070129395,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 13
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.7814483444527136e-09,
"advantages/std": 0.5227895379066467,
"advantages/var": 0.2733089009446452,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.05976520811099253,
"grad_norm": 0.20260901830215677,
"learning_rate": 9.99697999959198e-07,
"loss": -0.0,
"num_tokens": 2400663.0,
"reward": 0.7109375,
"reward_std": 0.13664263486862183,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 14
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.6363342984850635e-09,
"advantages/std": 0.6402894258499146,
"advantages/var": 0.4099705488552132,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.546875,
"epoch": 0.064034151547492,
"grad_norm": 0.25104401322218795,
"learning_rate": 9.996497570656062e-07,
"loss": 0.0,
"num_tokens": 2578690.0,
"reward": 0.63671875,
"reward_std": 0.21344566345214844,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 15
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.123630759509537e-09,
"advantages/std": 0.6816376447677612,
"advantages/var": 0.46462987876454065,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.46875,
"epoch": 0.06830309498399147,
"grad_norm": 0.2337666130483442,
"learning_rate": 9.995979423530892e-07,
"loss": 0.0,
"num_tokens": 2759120.0,
"reward": 0.640625,
"reward_std": 0.23778307437896729,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 16
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 3.149247399126245e-10,
"advantages/std": 0.7393215298652649,
"advantages/var": 0.5465963245223158,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 0.07257203842049093,
"grad_norm": 0.3111236732518641,
"learning_rate": 9.99542556192052e-07,
"loss": 0.0,
"num_tokens": 2941175.0,
"reward": 0.58984375,
"reward_std": 0.251722514629364,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 17
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.906080019985982e-10,
"advantages/std": 0.5960723757743835,
"advantages/var": 0.3553022771613179,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.0768409818569904,
"grad_norm": 0.19984891954596115,
"learning_rate": 9.994835989784303e-07,
"loss": 0.0,
"num_tokens": 3108867.0,
"reward": 0.6640625,
"reward_std": 0.17833054065704346,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 18
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.859250099670972e-09,
"advantages/std": 0.5960591435432434,
"advantages/var": 0.35528650260150485,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.59375,
"epoch": 0.08110992529348986,
"grad_norm": 0.18501721563546608,
"learning_rate": 9.99421071133689e-07,
"loss": 0.0,
"num_tokens": 3286939.0,
"reward": 0.61328125,
"reward_std": 0.1629534810781479,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 19
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.520419240832458e-09,
"advantages/std": 0.5482913851737976,
"advantages/var": 0.3006234430558017,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.484375,
"epoch": 0.08537886872998933,
"grad_norm": 0.17419719340312353,
"learning_rate": 9.993549731048169e-07,
"loss": 0.0,
"num_tokens": 3454477.0,
"reward": 0.66015625,
"reward_std": 0.13349363207817078,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 20
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.8819951499331522e-09,
"advantages/std": 0.6185739636421204,
"advantages/var": 0.38263374849592324,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.08964781216648879,
"grad_norm": 0.27585718695020495,
"learning_rate": 9.992853053643257e-07,
"loss": -0.0,
"num_tokens": 3623839.0,
"reward": 0.55078125,
"reward_std": 0.19385483860969543,
"rewards/drgrpo_math_reward/mean": 0.55078125,
"rewards/drgrpo_math_reward/std": 0.49838894605636597,
"step": 21
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.691813202266692e-09,
"advantages/std": 0.5726872682571411,
"advantages/var": 0.3279707072238267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.453125,
"epoch": 0.09391675560298826,
"grad_norm": 0.22555380621343457,
"learning_rate": 9.992120684102452e-07,
"loss": -0.0,
"num_tokens": 3800102.0,
"reward": 0.63671875,
"reward_std": 0.16439500451087952,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 22
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.363669808468423e-09,
"advantages/std": 0.6402793526649475,
"advantages/var": 0.4099576494490442,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.09818569903948772,
"grad_norm": 0.255857672328258,
"learning_rate": 9.991352627661204e-07,
"loss": -0.0,
"num_tokens": 3951987.0,
"reward": 0.73046875,
"reward_std": 0.19872015714645386,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 23
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 8.068186637020806e-09,
"advantages/std": 0.5482994318008423,
"advantages/var": 0.3006322669131265,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.359375,
"epoch": 0.10245464247598719,
"grad_norm": 0.21824678675243928,
"learning_rate": 9.990548889810077e-07,
"loss": 0.0,
"num_tokens": 4134083.0,
"reward": 0.61328125,
"reward_std": 0.14256632328033447,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 24
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 2.7326250551152195e-09,
"advantages/std": 0.6816321611404419,
"advantages/var": 0.46462240310098935,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.10672358591248667,
"grad_norm": 0.24711610053127914,
"learning_rate": 9.989709476294707e-07,
"loss": -0.0,
"num_tokens": 4300726.0,
"reward": 0.62890625,
"reward_std": 0.22882908582687378,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 25
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 7.151522306503461e-09,
"advantages/std": 0.618579089641571,
"advantages/var": 0.3826400901417948,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 0.11099252934898612,
"grad_norm": 0.2287106287368787,
"learning_rate": 9.988834393115767e-07,
"loss": 0.0,
"num_tokens": 4475968.0,
"reward": 0.609375,
"reward_std": 0.1990984082221985,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 26
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.225062602237694e-09,
"advantages/std": 0.661284327507019,
"advantages/var": 0.4372969618064104,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.1152614727854856,
"grad_norm": 0.28581877552254065,
"learning_rate": 9.98792364652891e-07,
"loss": 0.0,
"num_tokens": 4639754.0,
"reward": 0.66796875,
"reward_std": 0.22172591090202332,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 27
},
{
"advantages/mean": -6.05359673500061e-09,
"advantages/snr": 9.786420986722844e-09,
"advantages/std": 0.6185710430145264,
"advantages/var": 0.38263013525607903,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 0.11953041622198506,
"grad_norm": 0.24270294378064936,
"learning_rate": 9.986977243044745e-07,
"loss": 0.0,
"num_tokens": 4770384.0,
"reward": 0.796875,
"reward_std": 0.18714365363121033,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 28
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 3.983451559204775e-09,
"advantages/std": 0.7013936638832092,
"advantages/var": 0.4919530717355123,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 0.12379935965848453,
"grad_norm": 0.25235289880629225,
"learning_rate": 9.985995189428775e-07,
"loss": 0.0,
"num_tokens": 4949006.0,
"reward": 0.5703125,
"reward_std": 0.24488136172294617,
"rewards/drgrpo_math_reward/mean": 0.5703125,
"rewards/drgrpo_math_reward/std": 0.4960011839866638,
"step": 29
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.763953668416837e-09,
"advantages/std": 0.6185799837112427,
"advantages/var": 0.38264119624820125,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 0.128068303094984,
"grad_norm": 0.26694114427436527,
"learning_rate": 9.98497749270135e-07,
"loss": -0.0,
"num_tokens": 5103697.0,
"reward": 0.69140625,
"reward_std": 0.19910085201263428,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 30
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5055998919439133e-09,
"advantages/std": 0.6185724139213562,
"advantages/var": 0.3826318312644936,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 0.13233724653148346,
"grad_norm": 0.19746607159558405,
"learning_rate": 9.983924160137624e-07,
"loss": -0.0,
"num_tokens": 5283289.0,
"reward": 0.609375,
"reward_std": 0.191086545586586,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 31
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.520887723343088e-10,
"advantages/std": 0.6612839102745056,
"advantages/var": 0.4372964099879404,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.13660618996798293,
"grad_norm": 0.2770381026875717,
"learning_rate": 9.9828351992675e-07,
"loss": 0.0,
"num_tokens": 5449062.0,
"reward": 0.6640625,
"reward_std": 0.2210792601108551,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 32
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.163983994864379e-09,
"advantages/std": 0.495961457490921,
"advantages/var": 0.24597776731651866,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.14087513340448238,
"grad_norm": 0.22644008687355208,
"learning_rate": 9.981710617875575e-07,
"loss": 0.0,
"num_tokens": 5610253.0,
"reward": 0.6640625,
"reward_std": 0.12164874374866486,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 33
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.687293680438322e-09,
"advantages/std": 0.5960726737976074,
"advantages/var": 0.3553026324482289,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 0.14514407684098185,
"grad_norm": 0.19541733821886728,
"learning_rate": 9.980550424001074e-07,
"loss": 0.0,
"num_tokens": 5800728.0,
"reward": 0.59765625,
"reward_std": 0.17886094748973846,
"rewards/drgrpo_math_reward/mean": 0.59765625,
"rewards/drgrpo_math_reward/std": 0.4913311004638672,
"step": 34
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.812128792816569e-10,
"advantages/std": 0.5960747599601746,
"advantages/var": 0.3553051194615797,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5,
"epoch": 0.14941302027748132,
"grad_norm": 0.24645692096714833,
"learning_rate": 9.97935462593782e-07,
"loss": 0.0,
"num_tokens": 5971332.0,
"reward": 0.58984375,
"reward_std": 0.1822758913040161,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 35
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.272959047519299e-10,
"advantages/std": 0.6402638554573059,
"advantages/var": 0.4099378046050539,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.1536819637139808,
"grad_norm": 0.27255815392539734,
"learning_rate": 9.978123232234146e-07,
"loss": -0.0,
"num_tokens": 6133393.0,
"reward": 0.59375,
"reward_std": 0.17939528822898865,
"rewards/drgrpo_math_reward/mean": 0.59375,
"rewards/drgrpo_math_reward/std": 0.49209436774253845,
"step": 36
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.659004112460079e-09,
"advantages/std": 0.5726901888847351,
"advantages/var": 0.3279740524448336,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.15795090715048027,
"grad_norm": 0.2428885465959258,
"learning_rate": 9.976856251692849e-07,
"loss": -0.0,
"num_tokens": 6306233.0,
"reward": 0.62109375,
"reward_std": 0.16728198528289795,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 37
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.4536934733273464e-10,
"advantages/std": 0.5227810144424438,
"advantages/var": 0.2732999890614707,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.59375,
"epoch": 0.1622198505869797,
"grad_norm": 0.19391049030055343,
"learning_rate": 9.975553693371123e-07,
"loss": 0.0,
"num_tokens": 6472899.0,
"reward": 0.640625,
"reward_std": 0.12756995856761932,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 38
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.140450770645173e-09,
"advantages/std": 0.618564784526825,
"advantages/var": 0.3826223926567174,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 0.1664887940234792,
"grad_norm": 0.2458054922523459,
"learning_rate": 9.974215566580498e-07,
"loss": 0.0,
"num_tokens": 6635464.0,
"reward": 0.69140625,
"reward_std": 0.18136723339557648,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 39
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.48969158764971e-09,
"advantages/std": 0.4675893187522888,
"advantages/var": 0.21863977101122956,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 0.17075773745997866,
"grad_norm": 0.1921600026379778,
"learning_rate": 9.972841880886765e-07,
"loss": 0.0,
"num_tokens": 6805950.0,
"reward": 0.56640625,
"reward_std": 0.10205548256635666,
"rewards/drgrpo_math_reward/mean": 0.56640625,
"rewards/drgrpo_math_reward/std": 0.4965413510799408,
"step": 40
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5483123064041138,
"advantages/var": 0.30064638535419874,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.17502668089647813,
"grad_norm": 0.1966993448153858,
"learning_rate": 9.971432646109917e-07,
"loss": -0.0,
"num_tokens": 6974438.0,
"reward": 0.6953125,
"reward_std": 0.15729428827762604,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 41
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.2268175393589625e-09,
"advantages/std": 0.522787868976593,
"advantages/var": 0.2733071559490874,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.375,
"epoch": 0.17929562433297758,
"grad_norm": 0.17565374090970223,
"learning_rate": 9.969987872324075e-07,
"loss": 0.0,
"num_tokens": 7147608.0,
"reward": 0.6328125,
"reward_std": 0.13599355518817902,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 42
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.1249027630968986e-09,
"advantages/std": 0.5960649847984314,
"advantages/var": 0.35529346610275425,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.46875,
"epoch": 0.18356456776947705,
"grad_norm": 0.20919270586057445,
"learning_rate": 9.968507569857412e-07,
"loss": -0.0,
"num_tokens": 7331808.0,
"reward": 0.578125,
"reward_std": 0.16925784945487976,
"rewards/drgrpo_math_reward/mean": 0.578125,
"rewards/drgrpo_math_reward/std": 0.49482619762420654,
"step": 43
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.439349784680999e-09,
"advantages/std": 0.5726869702339172,
"advantages/var": 0.3279703658757036,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.18783351120597652,
"grad_norm": 0.22710593855397082,
"learning_rate": 9.966991749292086e-07,
"loss": -0.0,
"num_tokens": 7494201.0,
"reward": 0.6328125,
"reward_std": 0.16386458277702332,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 44
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5624291957442206e-09,
"advantages/std": 0.5960734486579895,
"advantages/var": 0.35530355619502885,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.546875,
"epoch": 0.192102454642476,
"grad_norm": 0.1873875778423085,
"learning_rate": 9.965440421464162e-07,
"loss": 0.0,
"num_tokens": 7668016.0,
"reward": 0.68359375,
"reward_std": 0.18003800511360168,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 45
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.672136567095384e-09,
"advantages/std": 0.5227965712547302,
"advantages/var": 0.2733162549157022,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 0.19637139807897544,
"grad_norm": 0.19598414326761007,
"learning_rate": 9.963853597463532e-07,
"loss": -0.0,
"num_tokens": 7831746.0,
"reward": 0.6640625,
"reward_std": 0.14518246054649353,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 46
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.8820033105193726e-09,
"advantages/std": 0.6185712814331055,
"advantages/var": 0.38263043021379417,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.2006403415154749,
"grad_norm": 0.23495355894033243,
"learning_rate": 9.962231288633838e-07,
"loss": 0.0,
"num_tokens": 8005499.0,
"reward": 0.66796875,
"reward_std": 0.1909678727388382,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 47
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.388929790548923e-10,
"advantages/std": 0.49596843123435974,
"advantages/var": 0.24598468478107183,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.484375,
"epoch": 0.20490928495197439,
"grad_norm": 0.17028775334740143,
"learning_rate": 9.960573506572389e-07,
"loss": 0.0,
"num_tokens": 8177309.0,
"reward": 0.58984375,
"reward_std": 0.1312469244003296,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 48
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 5.520271014273453e-09,
"advantages/std": 0.5483061075210571,
"advantages/var": 0.30063958754489306,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.20917822838847386,
"grad_norm": 0.21939679418814842,
"learning_rate": 9.958880263130084e-07,
"loss": -0.0,
"num_tokens": 8334953.0,
"reward": 0.73046875,
"reward_std": 0.14940111339092255,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 49
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.87778765062976e-09,
"advantages/std": 0.4959679841995239,
"advantages/var": 0.24598424135093921,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.21344717182497333,
"grad_norm": 0.17905287281997653,
"learning_rate": 9.957151570411316e-07,
"loss": -0.0,
"num_tokens": 8498503.0,
"reward": 0.6484375,
"reward_std": 0.12901148200035095,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 50
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.7555886158092e-09,
"advantages/std": 0.49596622586250305,
"advantages/var": 0.2459824971962954,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.21771611526147278,
"grad_norm": 0.19002154099000607,
"learning_rate": 9.9553874407739e-07,
"loss": -0.0,
"num_tokens": 8642045.0,
"reward": 0.6484375,
"reward_std": 0.12836240231990814,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 51
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.7342560139901876e-09,
"advantages/std": 0.5960723757743835,
"advantages/var": 0.3553022771613179,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 0.22198505869797225,
"grad_norm": 0.23688167107288094,
"learning_rate": 9.95358788682897e-07,
"loss": -0.0,
"num_tokens": 8833114.0,
"reward": 0.546875,
"reward_std": 0.17833054065704346,
"rewards/drgrpo_math_reward/mean": 0.546875,
"rewards/drgrpo_math_reward/std": 0.4987730085849762,
"step": 52
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 9.858661223793146e-09,
"advantages/std": 0.49595409631729126,
"advantages/var": 0.24597046565390102,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 0.22625400213447172,
"grad_norm": 0.21147731024352381,
"learning_rate": 9.951752921440904e-07,
"loss": 0.0,
"num_tokens": 8968225.0,
"reward": 0.8203125,
"reward_std": 0.11481395363807678,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 53
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.1231940844604465e-09,
"advantages/std": 0.5483027696609497,
"advantages/var": 0.30063592721786847,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.2305229455709712,
"grad_norm": 0.19216694044272573,
"learning_rate": 9.949882557727213e-07,
"loss": -0.0,
"num_tokens": 9128708.0,
"reward": 0.7421875,
"reward_std": 0.1459837108850479,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 54
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.2524406527748425e-09,
"advantages/std": 0.5726915001869202,
"advantages/var": 0.3279755543863452,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.23479188900747064,
"grad_norm": 0.1931123684231175,
"learning_rate": 9.947976809058467e-07,
"loss": -0.0,
"num_tokens": 9278853.0,
"reward": 0.80859375,
"reward_std": 0.16951987147331238,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 55
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.626250454143894e-09,
"advantages/std": 0.5726808905601501,
"advantages/var": 0.32796340241276667,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.515625,
"epoch": 0.2390608324439701,
"grad_norm": 0.22436070190282476,
"learning_rate": 9.946035689058187e-07,
"loss": 0.0,
"num_tokens": 9437868.0,
"reward": 0.69921875,
"reward_std": 0.15756022930145264,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 56
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.14039371842048e-09,
"advantages/std": 0.6185733079910278,
"advantages/var": 0.382632937358963,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.375,
"epoch": 0.24332977588046958,
"grad_norm": 0.22106574157254638,
"learning_rate": 9.94405921160275e-07,
"loss": -0.0,
"num_tokens": 9624456.0,
"reward": 0.50390625,
"reward_std": 0.19108900427818298,
"rewards/drgrpo_math_reward/mean": 0.50390625,
"rewards/drgrpo_math_reward/std": 0.5009641647338867,
"step": 57
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.687265558082829e-09,
"advantages/std": 0.596076250076294,
"advantages/var": 0.3553068959050165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.546875,
"epoch": 0.24759871931696906,
"grad_norm": 0.21659403501122557,
"learning_rate": 9.942047390821295e-07,
"loss": 0.0,
"num_tokens": 9797629.0,
"reward": 0.66015625,
"reward_std": 0.18292498588562012,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 58
},
{
"advantages/mean": 4.190951585769653e-09,
"advantages/snr": 7.317995276958649e-09,
"advantages/std": 0.5726912021636963,
"advantages/var": 0.32797521303569965,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.2518676627534685,
"grad_norm": 0.21183684386735038,
"learning_rate": 9.940000241095616e-07,
"loss": -0.0,
"num_tokens": 9959727.0,
"reward": 0.6640625,
"reward_std": 0.16898946464061737,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 59
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196689260745064e-09,
"advantages/std": 0.5726897716522217,
"advantages/var": 0.3279735745550738,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 0.256136606189968,
"grad_norm": 0.2038191691018717,
"learning_rate": 9.937917777060056e-07,
"loss": 0.0,
"num_tokens": 10104572.0,
"reward": 0.75,
"reward_std": 0.16834037005901337,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 60
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.9060964248770786e-10,
"advantages/std": 0.596069872379303,
"advantages/var": 0.35529929275827854,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.26040554962646745,
"grad_norm": 0.26121318948558775,
"learning_rate": 9.935800013601413e-07,
"loss": 0.0,
"num_tokens": 10247777.0,
"reward": 0.78515625,
"reward_std": 0.17597398161888123,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 61
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.4083615633936797e-09,
"advantages/std": 0.6612808704376221,
"advantages/var": 0.4372923896067391,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.2646744930629669,
"grad_norm": 0.26063853689599825,
"learning_rate": 9.93364696585883e-07,
"loss": -0.0,
"num_tokens": 10425033.0,
"reward": 0.6015625,
"reward_std": 0.21713145077228546,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"step": 62
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.125766567692018e-09,
"advantages/std": 0.5227915048599243,
"advantages/var": 0.27331095755370427,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.2689434364994664,
"grad_norm": 0.2080920271905109,
"learning_rate": 9.931458649223683e-07,
"loss": 0.0,
"num_tokens": 10576767.0,
"reward": 0.69140625,
"reward_std": 0.13782215118408203,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 63
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.516794888289669e-09,
"advantages/std": 0.6185730695724487,
"advantages/var": 0.3826326424002815,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.27321237993596587,
"grad_norm": 0.2266001825573351,
"learning_rate": 9.929235079339465e-07,
"loss": 0.0,
"num_tokens": 10730985.0,
"reward": 0.6328125,
"reward_std": 0.19055859744548798,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 64
},
{
"advantages/mean": -6.984919309616089e-09,
"advantages/snr": 1.273877537771054e-08,
"advantages/std": 0.5483195185661316,
"advantages/var": 0.3006542944405943,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 0.27748132337246534,
"grad_norm": 0.2285948803926055,
"learning_rate": 9.926976272101692e-07,
"loss": 0.0,
"num_tokens": 10884864.0,
"reward": 0.73046875,
"reward_std": 0.16477571427822113,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 65
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.4959557056427002,
"advantages/var": 0.24597206195954868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.28175026680896476,
"grad_norm": 0.1844509849519164,
"learning_rate": 9.924682243657778e-07,
"loss": 0.0,
"num_tokens": 11025620.0,
"reward": 0.7265625,
"reward_std": 0.11705183982849121,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 66
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.8819951499331522e-09,
"advantages/std": 0.6185739636421204,
"advantages/var": 0.38263374849592324,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.453125,
"epoch": 0.28601921024546423,
"grad_norm": 0.24663370784559255,
"learning_rate": 9.922353010406917e-07,
"loss": -0.0,
"num_tokens": 11203762.0,
"reward": 0.58203125,
"reward_std": 0.19385485351085663,
"rewards/drgrpo_math_reward/mean": 0.58203125,
"rewards/drgrpo_math_reward/std": 0.49419113993644714,
"step": 67
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.812152228159681e-10,
"advantages/std": 0.5960729718208313,
"advantages/var": 0.35530298773531754,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.4375,
"epoch": 0.2902881536819637,
"grad_norm": 0.20863606400263202,
"learning_rate": 9.91998858899997e-07,
"loss": -0.0,
"num_tokens": 11390686.0,
"reward": 0.6328125,
"reward_std": 0.17939136922359467,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 68
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 1.0615803089720003e-08,
"advantages/std": 0.5483114123344421,
"advantages/var": 0.3006454048961906,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 0.2945570971184632,
"grad_norm": 0.21968065629436126,
"learning_rate": 9.91758899633935e-07,
"loss": 0.0,
"num_tokens": 11532669.0,
"reward": 0.78515625,
"reward_std": 0.15570303797721863,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 69
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.3436386378449705e-09,
"advantages/std": 0.5960747599601746,
"advantages/var": 0.3553051194615797,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 0.29882604055496265,
"grad_norm": 0.22408499383948063,
"learning_rate": 9.915154249578892e-07,
"loss": -0.0,
"num_tokens": 11694199.0,
"reward": 0.73046875,
"reward_std": 0.1822758913040161,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 70
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.246355856158812e-09,
"advantages/std": 0.548306941986084,
"advantages/var": 0.30064050263013087,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 0.3030949839914621,
"grad_norm": 0.23142067685000983,
"learning_rate": 9.91268436612374e-07,
"loss": 0.0,
"num_tokens": 11840347.0,
"reward": 0.66796875,
"reward_std": 0.15057815611362457,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 71
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.3436733225620384e-09,
"advantages/std": 0.5960659384727478,
"advantages/var": 0.35529460300739757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.3073639274279616,
"grad_norm": 0.23038625562724183,
"learning_rate": 9.91017936363021e-07,
"loss": 0.0,
"num_tokens": 11972459.0,
"reward": 0.82421875,
"reward_std": 0.16926030814647675,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"step": 72
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.46758967638015747,
"advantages/var": 0.2186401054573004,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.31163287086446106,
"grad_norm": 0.176058018727384,
"learning_rate": 9.90763926000568e-07,
"loss": -0.0,
"num_tokens": 12133622.0,
"reward": 0.6640625,
"reward_std": 0.10258588939905167,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 73
},
{
"advantages/mean": 3.259629011154175e-09,
"advantages/snr": 5.468471563051712e-09,
"advantages/std": 0.5960767865180969,
"advantages/var": 0.3553075354257409,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.46875,
"epoch": 0.31590181430096054,
"grad_norm": 0.21721921107191133,
"learning_rate": 9.90506407340845e-07,
"loss": 0.0,
"num_tokens": 12312108.0,
"reward": 0.48828125,
"reward_std": 0.18398582935333252,
"rewards/drgrpo_math_reward/mean": 0.48828125,
"rewards/drgrpo_math_reward/std": 0.5008418560028076,
"step": 74
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.6363627332899125e-10,
"advantages/std": 0.6402844190597534,
"advantages/var": 0.4099641372906859,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.32017075773745995,
"grad_norm": 0.2705578088558078,
"learning_rate": 9.902453822247614e-07,
"loss": -0.0,
"num_tokens": 12496457.0,
"reward": 0.66796875,
"reward_std": 0.20608291029930115,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 75
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.33610210092994e-09,
"advantages/std": 0.5227833390235901,
"advantages/var": 0.27330241956065393,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.421875,
"epoch": 0.3244397011739594,
"grad_norm": 0.21517258013047688,
"learning_rate": 9.899808525182934e-07,
"loss": -0.0,
"num_tokens": 12676832.0,
"reward": 0.546875,
"reward_std": 0.12927989661693573,
"rewards/drgrpo_math_reward/mean": 0.546875,
"rewards/drgrpo_math_reward/std": 0.4987730085849762,
"step": 76
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.272772181567429e-10,
"advantages/std": 0.6402803063392639,
"advantages/var": 0.40995887068590164,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 0.3287086446104589,
"grad_norm": 0.22787543124127765,
"learning_rate": 9.897128201124698e-07,
"loss": -0.0,
"num_tokens": 12856929.0,
"reward": 0.5859375,
"reward_std": 0.20042762160301208,
"rewards/drgrpo_math_reward/mean": 0.5859375,
"rewards/drgrpo_math_reward/std": 0.4935242533683777,
"step": 77
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 8.917246484022717e-09,
"advantages/std": 0.5483131408691406,
"advantages/var": 0.30064730044978205,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.515625,
"epoch": 0.3329775880469584,
"grad_norm": 0.21594927536319913,
"learning_rate": 9.894412869233596e-07,
"loss": -0.0,
"num_tokens": 13041959.0,
"reward": 0.609375,
"reward_std": 0.15847134590148926,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 78
},
{
"advantages/mean": -6.51925802230835e-09,
"advantages/snr": 1.0539255098620102e-08,
"advantages/std": 0.6185691356658936,
"advantages/var": 0.3826277755984506,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.33724653148345785,
"grad_norm": 0.1842527654890239,
"learning_rate": 9.89166254892057e-07,
"loss": 0.0,
"num_tokens": 13212105.0,
"reward": 0.7421875,
"reward_std": 0.18543373048305511,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 79
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.098397946146296e-09,
"advantages/std": 0.5726847648620605,
"advantages/var": 0.3279678399051136,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.3415154749199573,
"grad_norm": 0.18973959019347075,
"learning_rate": 9.888877259846684e-07,
"loss": 0.0,
"num_tokens": 13372523.0,
"reward": 0.7265625,
"reward_std": 0.16044965386390686,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 80
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 7.904410095893117e-09,
"advantages/std": 0.6185715794563293,
"advantages/var": 0.38263079891109797,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.234375,
"epoch": 0.3457844183564568,
"grad_norm": 0.24357128545071005,
"learning_rate": 9.886057021922982e-07,
"loss": 0.0,
"num_tokens": 13556828.0,
"reward": 0.59375,
"reward_std": 0.1914982795715332,
"rewards/drgrpo_math_reward/mean": 0.59375,
"rewards/drgrpo_math_reward/std": 0.49209436774253845,
"step": 81
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 7.030935599219189e-09,
"advantages/std": 0.5960730910301208,
"advantages/var": 0.35530312985020274,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.35005336179295626,
"grad_norm": 0.20902800614565434,
"learning_rate": 9.883201855310348e-07,
"loss": 0.0,
"num_tokens": 13716324.0,
"reward": 0.6875,
"reward_std": 0.17780256271362305,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 82
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.5454588894424267e-09,
"advantages/std": 0.6402831673622131,
"advantages/var": 0.40996253440738784,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.35432230522945574,
"grad_norm": 0.2840754464428357,
"learning_rate": 9.880311780419353e-07,
"loss": 0.0,
"num_tokens": 13871107.0,
"reward": 0.75390625,
"reward_std": 0.20384500920772552,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 83
},
{
"advantages/mean": 4.190951585769653e-09,
"advantages/snr": 5.975091018752593e-09,
"advantages/std": 0.701403796672821,
"advantages/var": 0.4919672859870481,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.35859124866595515,
"grad_norm": 0.2649071214396161,
"learning_rate": 9.877386817910116e-07,
"loss": 0.0,
"num_tokens": 14054447.0,
"reward": 0.58984375,
"reward_std": 0.2580205500125885,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 84
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.5628885646149507e-09,
"advantages/std": 0.5227907299995422,
"advantages/var": 0.27331014737345427,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.3628601921024546,
"grad_norm": 0.19237083737953337,
"learning_rate": 9.874426988692163e-07,
"loss": 0.0,
"num_tokens": 14221007.0,
"reward": 0.80859375,
"reward_std": 0.13835012912750244,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 85
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.011203265744322e-09,
"advantages/std": 0.6185716986656189,
"advantages/var": 0.3826309463900692,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 0.3671291355389541,
"grad_norm": 0.2315622924620685,
"learning_rate": 9.871432313924253e-07,
"loss": -0.0,
"num_tokens": 14385112.0,
"reward": 0.671875,
"reward_std": 0.18990950286388397,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 86
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.7789658227491635e-09,
"advantages/std": 0.36967357993125916,
"advantages/var": 0.13665855569919305,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 0.37139807897545357,
"grad_norm": 0.15106295394289238,
"learning_rate": 9.868402815014265e-07,
"loss": 0.0,
"num_tokens": 14535844.0,
"reward": 0.65234375,
"reward_std": 0.07232724130153656,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 87
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.694495078714221e-10,
"advantages/std": 0.49596524238586426,
"advantages/var": 0.24598152165486908,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.37566702241195304,
"grad_norm": 0.1867263412626549,
"learning_rate": 9.865338513619004e-07,
"loss": -0.0,
"num_tokens": 14695433.0,
"reward": 0.69140625,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 88
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 2.9090742089424973e-09,
"advantages/std": 0.6402879357337952,
"advantages/var": 0.4099686406462446,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.3799359658484525,
"grad_norm": 0.2450922826903169,
"learning_rate": 9.86223943164408e-07,
"loss": 0.0,
"num_tokens": 14857936.0,
"reward": 0.6796875,
"reward_std": 0.2106773555278778,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 89
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5726829767227173,
"advantages/var": 0.32796579182799235,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.390625,
"epoch": 0.384204909284952,
"grad_norm": 0.2024341681897372,
"learning_rate": 9.859105591243726e-07,
"loss": -0.0,
"num_tokens": 15042816.0,
"reward": 0.62109375,
"reward_std": 0.15927013754844666,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 90
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.285282070613587e-09,
"advantages/std": 0.5726843476295471,
"advantages/var": 0.32796736201987997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.38847385272145146,
"grad_norm": 0.21879951538884707,
"learning_rate": 9.85593701482066e-07,
"loss": 0.0,
"num_tokens": 15198870.0,
"reward": 0.66796875,
"reward_std": 0.16150802373886108,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 91
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.6597939099176911e-09,
"advantages/std": 0.7013841867446899,
"advantages/var": 0.4919397774155101,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.3927427961579509,
"grad_norm": 0.32495948531780694,
"learning_rate": 9.852733725025907e-07,
"loss": 0.0,
"num_tokens": 15351762.0,
"reward": 0.65234375,
"reward_std": 0.22962543368339539,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 92
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.0082834527591325e-09,
"advantages/std": 0.5227863192558289,
"advantages/var": 0.2733055356010574,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.39701173959445035,
"grad_norm": 0.24303979574096995,
"learning_rate": 9.849495744758654e-07,
"loss": 0.0,
"num_tokens": 15500943.0,
"reward": 0.6953125,
"reward_std": 0.1337556540966034,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 93
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.408336252804626e-09,
"advantages/std": 0.4959695637226105,
"advantages/var": 0.24598580813919657,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.4012806830309498,
"grad_norm": 0.18723812086741365,
"learning_rate": 9.84622309716607e-07,
"loss": -0.0,
"num_tokens": 15646996.0,
"reward": 0.7265625,
"reward_std": 0.13124938309192657,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 94
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.008182001769061e-09,
"advantages/std": 0.522799551486969,
"advantages/var": 0.27331937103497594,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.4055496264674493,
"grad_norm": 0.22017340183627318,
"learning_rate": 9.842915805643156e-07,
"loss": 0.0,
"num_tokens": 15808802.0,
"reward": 0.6796875,
"reward_std": 0.14795321226119995,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 95
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 7.724561021577794e-09,
"advantages/std": 0.5726904273033142,
"advantages/var": 0.3279743255248526,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 0.40981856990394877,
"grad_norm": 0.2742430048994587,
"learning_rate": 9.839573893832563e-07,
"loss": 0.0,
"num_tokens": 15962018.0,
"reward": 0.65625,
"reward_std": 0.16781240701675415,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 96
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.285112648635477e-09,
"advantages/std": 0.5727027058601379,
"advantages/var": 0.3279883892995237,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 0.41408751334044824,
"grad_norm": 0.25942822428286266,
"learning_rate": 9.836197385624432e-07,
"loss": 0.0,
"num_tokens": 16118197.0,
"reward": 0.6484375,
"reward_std": 0.18212617933750153,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 97
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.131044932268215e-10,
"advantages/std": 0.5726954936981201,
"advantages/var": 0.32798012850213354,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 0.4183564567769477,
"grad_norm": 0.21659055119101517,
"learning_rate": 9.832786305156228e-07,
"loss": 0.0,
"num_tokens": 16274190.0,
"reward": 0.6953125,
"reward_std": 0.17411433160305023,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 98
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.246282000263511e-10,
"advantages/std": 0.548316478729248,
"advantages/var": 0.3006509608460419,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 0.4226254002134472,
"grad_norm": 0.2244429283719818,
"learning_rate": 9.829340676812552e-07,
"loss": -0.0,
"num_tokens": 16417922.0,
"reward": 0.69140625,
"reward_std": 0.1618887335062027,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 99
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.8819860826981316e-09,
"advantages/std": 0.6185769438743591,
"advantages/var": 0.38263743549294205,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 0.42689434364994666,
"grad_norm": 0.2245456822690171,
"learning_rate": 9.825860525224981e-07,
"loss": -0.0,
"num_tokens": 16570727.0,
"reward": 0.7890625,
"reward_std": 0.19727224111557007,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 100
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 4.065590591029637e-09,
"advantages/std": 0.5726858973503113,
"advantages/var": 0.32796913702393127,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.4311632870864461,
"grad_norm": 0.24390358364587256,
"learning_rate": 9.822345875271883e-07,
"loss": 0.0,
"num_tokens": 16718926.0,
"reward": 0.76171875,
"reward_std": 0.1621571183204651,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 101
},
{
"advantages/mean": 3.4924596548080444e-09,
"advantages/snr": 5.281322540434018e-09,
"advantages/std": 0.6612850427627563,
"advantages/var": 0.4372979077817405,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 0.43543223052294555,
"grad_norm": 0.27030435312221507,
"learning_rate": 9.818796752078243e-07,
"loss": -0.0,
"num_tokens": 16898303.0,
"reward": 0.55859375,
"reward_std": 0.22290295362472534,
"rewards/drgrpo_math_reward/mean": 0.55859375,
"rewards/drgrpo_math_reward/std": 0.4975275993347168,
"step": 102
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.6985576679666268e-09,
"advantages/std": 0.5483019948005676,
"advantages/var": 0.3006350775022817,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 0.439701173959445,
"grad_norm": 0.1900534271188019,
"learning_rate": 9.815213181015487e-07,
"loss": 0.0,
"num_tokens": 17065727.0,
"reward": 0.7265625,
"reward_std": 0.14651167392730713,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 103
},
{
"advantages/mean": -6.28642737865448e-09,
"advantages/snr": 1.0546620649499414e-08,
"advantages/std": 0.5960608124732971,
"advantages/var": 0.3552884921663271,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.4439701173959445,
"grad_norm": 0.28215753816918543,
"learning_rate": 9.811595187701293e-07,
"loss": 0.0,
"num_tokens": 17218805.0,
"reward": 0.6875,
"reward_std": 0.1641329973936081,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 104
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.5155324825148615e-09,
"advantages/std": 0.5960621237754822,
"advantages/var": 0.35529005539973824,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 0.44823906083244397,
"grad_norm": 0.24532268661495166,
"learning_rate": 9.807942797999412e-07,
"loss": -0.0,
"num_tokens": 17382522.0,
"reward": 0.5546875,
"reward_std": 0.16637088358402252,
"rewards/drgrpo_math_reward/mean": 0.5546875,
"rewards/drgrpo_math_reward/std": 0.49797385931015015,
"step": 105
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.659041433501714e-09,
"advantages/std": 0.5726843476295471,
"advantages/var": 0.32796736201987997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.45250800426894344,
"grad_norm": 0.251796558756451,
"learning_rate": 9.804256038019481e-07,
"loss": 0.0,
"num_tokens": 17534509.0,
"reward": 0.79296875,
"reward_std": 0.16150803864002228,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 106
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 6.8315477033413135e-09,
"advantages/std": 0.6816336512565613,
"advantages/var": 0.4646244345253514,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 0.4567769477054429,
"grad_norm": 0.22918055149473104,
"learning_rate": 9.800534934116842e-07,
"loss": 0.0,
"num_tokens": 17707238.0,
"reward": 0.6953125,
"reward_std": 0.23330241441726685,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 107
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.94494343648925e-09,
"advantages/std": 0.5483027696609497,
"advantages/var": 0.30063592721786847,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 0.4610458911419424,
"grad_norm": 0.20682523205492293,
"learning_rate": 9.796779512892345e-07,
"loss": -0.0,
"num_tokens": 17877082.0,
"reward": 0.6953125,
"reward_std": 0.1459837108850479,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 108
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983506540239536e-09,
"advantages/std": 0.4675893187522888,
"advantages/var": 0.21863977101122956,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.46531483457844186,
"grad_norm": 0.20803318441731042,
"learning_rate": 9.792989801192167e-07,
"loss": -0.0,
"num_tokens": 18029785.0,
"reward": 0.76171875,
"reward_std": 0.10205547511577606,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 109
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.572338921941043e-09,
"advantages/std": 0.4959617853164673,
"advantages/var": 0.24597809249429758,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.4695837780149413,
"grad_norm": 0.1849990700457602,
"learning_rate": 9.78916582610761e-07,
"loss": -0.0,
"num_tokens": 18185127.0,
"reward": 0.73046875,
"reward_std": 0.12217915058135986,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 110
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.2584118068471646e-09,
"advantages/std": 0.6185691356658936,
"advantages/var": 0.3826277755984506,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5,
"epoch": 0.47385272145144075,
"grad_norm": 0.2517914798778077,
"learning_rate": 9.78530761497492e-07,
"loss": -0.0,
"num_tokens": 18350297.0,
"reward": 0.5859375,
"reward_std": 0.18543371558189392,
"rewards/drgrpo_math_reward/mean": 0.5859375,
"rewards/drgrpo_math_reward/std": 0.4935242533683777,
"step": 111
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 6.831425840696212e-10,
"advantages/std": 0.6816458106040955,
"advantages/var": 0.4646410111141144,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.4781216648879402,
"grad_norm": 0.24095208318789144,
"learning_rate": 9.781415195375076e-07,
"loss": -0.0,
"num_tokens": 18528245.0,
"reward": 0.57421875,
"reward_std": 0.24974030256271362,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 112
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.979300026668281e-10,
"advantages/std": 0.4675971269607544,
"advantages/var": 0.21864707314195186,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.4823906083244397,
"grad_norm": 0.1594297751631633,
"learning_rate": 9.77748859513361e-07,
"loss": -0.0,
"num_tokens": 18687978.0,
"reward": 0.68359375,
"reward_std": 0.10889026522636414,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 113
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.3436505898950562e-09,
"advantages/std": 0.596071720123291,
"advantages/var": 0.355301495530739,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 0.48665955176093917,
"grad_norm": 0.20941137193092302,
"learning_rate": 9.77352784232039e-07,
"loss": 0.0,
"num_tokens": 18869022.0,
"reward": 0.6171875,
"reward_std": 0.17885850369930267,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 114
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 8.537693145596392e-09,
"advantages/std": 0.5726890563964844,
"advantages/var": 0.32797275531629566,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.49092849519743864,
"grad_norm": 0.25209405893787906,
"learning_rate": 9.769532965249435e-07,
"loss": 0.0,
"num_tokens": 19019613.0,
"reward": 0.7265625,
"reward_std": 0.16557452082633972,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 115
},
{
"advantages/mean": 5.122274160385132e-09,
"advantages/snr": 7.74590481727937e-09,
"advantages/std": 0.6612880229949951,
"advantages/var": 0.4373018493566292,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.40625,
"epoch": 0.4951974386339381,
"grad_norm": 0.2484135721878627,
"learning_rate": 9.765503992478703e-07,
"loss": -0.0,
"num_tokens": 19201913.0,
"reward": 0.56640625,
"reward_std": 0.22685076296329498,
"rewards/drgrpo_math_reward/mean": 0.56640625,
"rewards/drgrpo_math_reward/std": 0.4965413510799408,
"step": 116
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.5050695213580615e-09,
"advantages/std": 0.5726749300956726,
"advantages/var": 0.3279565755600835,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.4994663820704376,
"grad_norm": 0.20850762851886148,
"learning_rate": 9.761440952809897e-07,
"loss": 0.0,
"num_tokens": 19370768.0,
"reward": 0.71875,
"reward_std": 0.14966705441474915,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 117
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.1718284587501684e-09,
"advantages/std": 0.5960701107978821,
"advantages/var": 0.3552995769865994,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.234375,
"epoch": 0.503735325506937,
"grad_norm": 0.19440456705898,
"learning_rate": 9.75734387528824e-07,
"loss": -0.0,
"num_tokens": 19559213.0,
"reward": 0.58984375,
"reward_std": 0.17609019577503204,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 118
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 6.899591307565781e-09,
"advantages/std": 0.40494683384895325,
"advantages/var": 0.16398193824429175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.5080042689434365,
"grad_norm": 0.1625977913542358,
"learning_rate": 9.75321278920229e-07,
"loss": 0.0,
"num_tokens": 19693612.0,
"reward": 0.8203125,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 119
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 2.6556072918866842e-09,
"advantages/std": 0.7014008164405823,
"advantages/var": 0.4919631053035154,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.40625,
"epoch": 0.512273212379936,
"grad_norm": 0.27257864175109514,
"learning_rate": 9.749047724083714e-07,
"loss": -0.0,
"num_tokens": 19880428.0,
"reward": 0.55078125,
"reward_std": 0.25395649671554565,
"rewards/drgrpo_math_reward/mean": 0.55078125,
"rewards/drgrpo_math_reward/std": 0.49838894605636597,
"step": 120
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.520307620092389e-09,
"advantages/std": 0.5483024716377258,
"advantages/var": 0.30063560040403914,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.5165421558164355,
"grad_norm": 0.23162261371168508,
"learning_rate": 9.74484870970709e-07,
"loss": -0.0,
"num_tokens": 20019486.0,
"reward": 0.75390625,
"reward_std": 0.1454533040523529,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 121
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.5478002194722765e-09,
"advantages/std": 0.5483098030090332,
"advantages/var": 0.3006436400758048,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.40625,
"epoch": 0.5208110992529349,
"grad_norm": 0.22311043728478686,
"learning_rate": 9.74061577608968e-07,
"loss": 0.0,
"num_tokens": 20199121.0,
"reward": 0.58203125,
"reward_std": 0.15505394339561462,
"rewards/drgrpo_math_reward/mean": 0.58203125,
"rewards/drgrpo_math_reward/std": 0.49419113993644714,
"step": 122
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.369594716994514e-09,
"advantages/std": 0.5483016967773438,
"advantages/var": 0.3006347506889142,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.25,
"epoch": 0.5250800426894343,
"grad_norm": 0.25070328563799926,
"learning_rate": 9.736348953491221e-07,
"loss": 0.0,
"num_tokens": 20377999.0,
"reward": 0.57421875,
"reward_std": 0.14598125219345093,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 123
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 7.469034298941239e-09,
"advantages/std": 0.4675918519496918,
"advantages/var": 0.21864214000974247,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.484375,
"epoch": 0.5293489861259338,
"grad_norm": 0.18374980419818973,
"learning_rate": 9.732048272413725e-07,
"loss": -0.0,
"num_tokens": 20549287.0,
"reward": 0.65234375,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 124
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.468483593103085e-09,
"advantages/std": 0.5960754752159119,
"advantages/var": 0.35530597215387516,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 0.5336179295624333,
"grad_norm": 0.25946092115826264,
"learning_rate": 9.727713763601226e-07,
"loss": 0.0,
"num_tokens": 20713814.0,
"reward": 0.64453125,
"reward_std": 0.1817479431629181,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 125
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 2.987551856360924e-09,
"advantages/std": 0.7014023065567017,
"advantages/var": 0.4919651956430613,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 0.5378868729989328,
"grad_norm": 0.2839156360779615,
"learning_rate": 9.723345458039593e-07,
"loss": 0.0,
"num_tokens": 20890843.0,
"reward": 0.6015625,
"reward_std": 0.25513601303100586,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"step": 126
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.226836580992585e-09,
"advantages/std": 0.5227833986282349,
"advantages/var": 0.2733024818812879,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.5421558164354322,
"grad_norm": 0.24753063560244626,
"learning_rate": 9.718943386956296e-07,
"loss": -0.0,
"num_tokens": 21043874.0,
"reward": 0.7109375,
"reward_std": 0.13098491728305817,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 127
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.281328252797118e-09,
"advantages/std": 0.661284327507019,
"advantages/var": 0.4372969618064104,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.5464247598719317,
"grad_norm": 0.27079609566069784,
"learning_rate": 9.714507581820179e-07,
"loss": 0.0,
"num_tokens": 21215949.0,
"reward": 0.60546875,
"reward_std": 0.22172591090202332,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 128
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196727343226576e-09,
"advantages/std": 0.5726879835128784,
"advantages/var": 0.3279715264600469,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.5506937033084311,
"grad_norm": 0.23678676576756394,
"learning_rate": 9.71003807434124e-07,
"loss": -0.0,
"num_tokens": 21390672.0,
"reward": 0.62109375,
"reward_std": 0.1638670563697815,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 129
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.1291945873965149e-09,
"advantages/std": 0.6185753345489502,
"advantages/var": 0.38263544451234566,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.5549626467449307,
"grad_norm": 0.24438298608082065,
"learning_rate": 9.7055348964704e-07,
"loss": -0.0,
"num_tokens": 21549093.0,
"reward": 0.66796875,
"reward_std": 0.19450394809246063,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 130
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.9916779936341355e-09,
"advantages/std": 0.4676069915294647,
"advantages/var": 0.2186562985272369,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.5592315901814301,
"grad_norm": 0.17529735964744536,
"learning_rate": 9.700998080399285e-07,
"loss": -0.0,
"num_tokens": 21712261.0,
"reward": 0.671875,
"reward_std": 0.11849337071180344,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 131
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.2524223734538706e-09,
"advantages/std": 0.572694718837738,
"advantages/var": 0.3279792409846358,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 0.5635005336179295,
"grad_norm": 0.2084611382117928,
"learning_rate": 9.696427658559982e-07,
"loss": 0.0,
"num_tokens": 21881266.0,
"reward": 0.640625,
"reward_std": 0.172937273979187,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 132
},
{
"advantages/mean": -5.3551048040390015e-09,
"advantages/snr": 9.350906047837328e-09,
"advantages/std": 0.5726829767227173,
"advantages/var": 0.32796579182799235,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.59375,
"epoch": 0.567769477054429,
"grad_norm": 0.22218752716256074,
"learning_rate": 9.691823663624816e-07,
"loss": 0.0,
"num_tokens": 22042837.0,
"reward": 0.75390625,
"reward_std": 0.15927013754844666,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 133
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.226790627738806e-09,
"advantages/std": 0.5227941870689392,
"advantages/var": 0.273313762033073,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.5720384204909285,
"grad_norm": 0.22175051939206694,
"learning_rate": 9.687186128506113e-07,
"loss": 0.0,
"num_tokens": 22214689.0,
"reward": 0.6328125,
"reward_std": 0.14176751673221588,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 134
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.258423993525738e-09,
"advantages/std": 0.6185657978057861,
"advantages/var": 0.3826236462151087,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5,
"epoch": 0.576307363927428,
"grad_norm": 0.23234493551308205,
"learning_rate": 9.682515086355972e-07,
"loss": -0.0,
"num_tokens": 22391548.0,
"reward": 0.609375,
"reward_std": 0.1830746978521347,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 135
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.906095253094573e-09,
"advantages/std": 0.5960700511932373,
"advantages/var": 0.35529950592950854,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 0.5805763073639274,
"grad_norm": 0.24149835642056233,
"learning_rate": 9.67781057056601e-07,
"loss": 0.0,
"num_tokens": 22569499.0,
"reward": 0.62890625,
"reward_std": 0.1743851751089096,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 136
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.571054469954958e-09,
"advantages/std": 0.5227965116500854,
"advantages/var": 0.27331619259349793,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.5848452508004269,
"grad_norm": 0.22422672779414102,
"learning_rate": 9.673072614767146e-07,
"loss": 0.0,
"num_tokens": 22711127.0,
"reward": 0.7421875,
"reward_std": 0.1434774398803711,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 137
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 3.636395230754275e-10,
"advantages/std": 0.640278697013855,
"advantages/var": 0.4099568098497599,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 0.5891141942369263,
"grad_norm": 0.25984251362548044,
"learning_rate": 9.668301252829343e-07,
"loss": 0.0,
"num_tokens": 22891737.0,
"reward": 0.66015625,
"reward_std": 0.19924810528755188,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 138
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.5723713064881815e-09,
"advantages/std": 0.4959593415260315,
"advantages/var": 0.24597566844693475,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 0.5933831376734259,
"grad_norm": 0.2247332333432713,
"learning_rate": 9.66349651886138e-07,
"loss": 0.0,
"num_tokens": 23038921.0,
"reward": 0.76171875,
"reward_std": 0.12046922743320465,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 139
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 7.812210035940531e-09,
"advantages/std": 0.5960685610771179,
"advantages/var": 0.35529772950454586,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 0.5976520811099253,
"grad_norm": 0.2377338133628425,
"learning_rate": 9.658658447210594e-07,
"loss": 0.0,
"num_tokens": 23200038.0,
"reward": 0.68359375,
"reward_std": 0.1737360954284668,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 140
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.3663241168119124e-09,
"advantages/std": 0.6816263794898987,
"advantages/var": 0.46461452121650737,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 0.6019210245464247,
"grad_norm": 0.2712603744994714,
"learning_rate": 9.653787072462643e-07,
"loss": 0.0,
"num_tokens": 23383317.0,
"reward": 0.5625,
"reward_std": 0.2213476598262787,
"rewards/drgrpo_math_reward/mean": 0.5625,
"rewards/drgrpo_math_reward/std": 0.49705013632774353,
"step": 141
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.49595075845718384,
"advantages/var": 0.2459671548142559,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 0.6061899679829242,
"grad_norm": 0.17658537259355056,
"learning_rate": 9.648882429441256e-07,
"loss": -0.0,
"num_tokens": 23547586.0,
"reward": 0.578125,
"reward_std": 0.11192697286605835,
"rewards/drgrpo_math_reward/mean": 0.578125,
"rewards/drgrpo_math_reward/std": 0.49482619762420654,
"step": 142
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.96696941108981e-09,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.6104589114194237,
"grad_norm": 0.1844179462576979,
"learning_rate": 9.64394455320799e-07,
"loss": 0.0,
"num_tokens": 23701368.0,
"reward": 0.69921875,
"reward_std": 0.10376539826393127,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 143
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.5209299319642286e-10,
"advantages/std": 0.6612759828567505,
"advantages/var": 0.43728592550316137,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.6147278548559232,
"grad_norm": 0.25430397247621467,
"learning_rate": 9.63897347906197e-07,
"loss": -0.0,
"num_tokens": 23881005.0,
"reward": 0.671875,
"reward_std": 0.20976868271827698,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 144
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.505114884350798e-09,
"advantages/std": 0.5726709365844727,
"advantages/var": 0.3279520016085371,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 0.6189967982924226,
"grad_norm": 0.31751555496280937,
"learning_rate": 9.633969242539642e-07,
"loss": 0.0,
"num_tokens": 24039875.0,
"reward": 0.66015625,
"reward_std": 0.14507260918617249,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 145
},
{
"advantages/mean": -5.587935447692871e-09,
"advantages/snr": 1.0688690066716281e-08,
"advantages/std": 0.5227895379066467,
"advantages/var": 0.2733089009446452,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.6232657417289221,
"grad_norm": 0.21736187090374912,
"learning_rate": 9.628931879414516e-07,
"loss": 0.0,
"num_tokens": 24193969.0,
"reward": 0.65625,
"reward_std": 0.13664264976978302,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 146
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.7814402201807623e-09,
"advantages/std": 0.5227919220924377,
"advantages/var": 0.2733113938051055,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5,
"epoch": 0.6275346851654215,
"grad_norm": 0.221860019654238,
"learning_rate": 9.623861425696917e-07,
"loss": -0.0,
"num_tokens": 24361745.0,
"reward": 0.6484375,
"reward_std": 0.14005759358406067,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 147
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.077846141054734e-09,
"advantages/std": 0.5960791707038879,
"advantages/var": 0.3553103777470348,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.6318036286019211,
"grad_norm": 0.24882667930071267,
"learning_rate": 9.618757917633722e-07,
"loss": 0.0,
"num_tokens": 24499558.0,
"reward": 0.75,
"reward_std": 0.18793118000030518,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 148
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.7639203017764606e-10,
"advantages/std": 0.618585467338562,
"advantages/var": 0.38264798040246717,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.6360725720384205,
"grad_norm": 0.25328994588616843,
"learning_rate": 9.613621391708097e-07,
"loss": 0.0,
"num_tokens": 24672886.0,
"reward": 0.53125,
"reward_std": 0.20699402689933777,
"rewards/drgrpo_math_reward/mean": 0.53125,
"rewards/drgrpo_math_reward/std": 0.5,
"step": 149
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196822550470801e-09,
"advantages/std": 0.5726835131645203,
"advantages/var": 0.32796640625045725,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.546875,
"epoch": 0.6403415154749199,
"grad_norm": 0.310149121041335,
"learning_rate": 9.608451884639248e-07,
"loss": 0.0,
"num_tokens": 24837025.0,
"reward": 0.6171875,
"reward_std": 0.15991678833961487,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 150
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 4.8931345825649164e-09,
"advantages/std": 0.6185806393623352,
"advantages/var": 0.3826420073939154,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 0.6446104589114194,
"grad_norm": 0.27449635019824886,
"learning_rate": 9.603249433382144e-07,
"loss": 0.0,
"num_tokens": 24996536.0,
"reward": 0.62890625,
"reward_std": 0.20186668634414673,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 151
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 6.337556201943589e-09,
"advantages/std": 0.6612882614135742,
"advantages/var": 0.4373021646833877,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.6488794023479189,
"grad_norm": 0.22105427222019133,
"learning_rate": 9.598014075127264e-07,
"loss": -0.0,
"num_tokens": 25172186.0,
"reward": 0.6171875,
"reward_std": 0.22738119959831238,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 152
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 9.581590371992046e-09,
"advantages/std": 0.43739622831344604,
"advantages/var": 0.19131546054282822,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.6531483457844184,
"grad_norm": 0.2409562006634107,
"learning_rate": 9.592745847300331e-07,
"loss": 0.0,
"num_tokens": 25323454.0,
"reward": 0.6953125,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 153
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.9917335935919526e-09,
"advantages/std": 0.4675939381122589,
"advantages/var": 0.21864409095933102,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 0.6574172892209178,
"grad_norm": 0.22997997409735074,
"learning_rate": 9.587444787562037e-07,
"loss": -0.0,
"num_tokens": 25492051.0,
"reward": 0.6484375,
"reward_std": 0.1065337061882019,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 154
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196999005160248e-09,
"advantages/std": 0.5726752281188965,
"advantages/var": 0.3279569169010301,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 0.6616862326574173,
"grad_norm": 0.24108914953717586,
"learning_rate": 9.582110933807776e-07,
"loss": 0.0,
"num_tokens": 25657409.0,
"reward": 0.56640625,
"reward_std": 0.15019746124744415,
"rewards/drgrpo_math_reward/mean": 0.56640625,
"rewards/drgrpo_math_reward/std": 0.4965413510799408,
"step": 155
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.6402879357337952,
"advantages/var": 0.4099686406462446,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.6659551760939167,
"grad_norm": 0.24492241583107927,
"learning_rate": 9.576744324167378e-07,
"loss": 0.0,
"num_tokens": 25817785.0,
"reward": 0.71875,
"reward_std": 0.2106773555278778,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 156
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.272752356351317e-09,
"advantages/std": 0.6402793526649475,
"advantages/var": 0.4099576494490442,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.6702241195304163,
"grad_norm": 0.2652156850382761,
"learning_rate": 9.571344997004831e-07,
"loss": -0.0,
"num_tokens": 25981209.0,
"reward": 0.67578125,
"reward_std": 0.19872015714645386,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 157
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 5.985519380243054e-09,
"advantages/std": 0.6612827777862549,
"advantages/var": 0.43729491219670535,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 0.6744930629669157,
"grad_norm": 0.24167674819050053,
"learning_rate": 9.565912990918014e-07,
"loss": 0.0,
"num_tokens": 26153857.0,
"reward": 0.671875,
"reward_std": 0.2205463945865631,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 158
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 8.917546993755515e-09,
"advantages/std": 0.5482946634292603,
"advantages/var": 0.3006270379450058,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 0.6787620064034151,
"grad_norm": 0.25223176431793914,
"learning_rate": 9.560448344738409e-07,
"loss": 0.0,
"num_tokens": 26318338.0,
"reward": 0.7265625,
"reward_std": 0.1369110345840454,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 159
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.9917699002625455e-09,
"advantages/std": 0.46758541464805603,
"advantages/var": 0.2186361199915945,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 0.6830309498399146,
"grad_norm": 0.23792049639198487,
"learning_rate": 9.554951097530832e-07,
"loss": 0.0,
"num_tokens": 26439011.0,
"reward": 0.8203125,
"reward_std": 0.09863808751106262,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 160
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.48964112740099e-09,
"advantages/std": 0.4675987958908081,
"advantages/var": 0.21864863391853362,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.687299893276414,
"grad_norm": 0.17305993617314033,
"learning_rate": 9.549421288593157e-07,
"loss": -0.0,
"num_tokens": 26599069.0,
"reward": 0.62890625,
"reward_std": 0.11112815886735916,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 161
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.131161717523846e-10,
"advantages/std": 0.5726872682571411,
"advantages/var": 0.3279707072238267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.6915688367129136,
"grad_norm": 0.1947637979578463,
"learning_rate": 9.543858957456025e-07,
"loss": 0.0,
"num_tokens": 26749602.0,
"reward": 0.74609375,
"reward_std": 0.1643950194120407,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 162
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.4896779409655654e-09,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.40625,
"epoch": 0.695837780149413,
"grad_norm": 0.17657640815068512,
"learning_rate": 9.53826414388257e-07,
"loss": 0.0,
"num_tokens": 26912977.0,
"reward": 0.64453125,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 163
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.5624695056307517e-09,
"advantages/std": 0.5960580706596375,
"advantages/var": 0.35528522359848935,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 0.7001067235859125,
"grad_norm": 0.3028446810703845,
"learning_rate": 9.532636887868132e-07,
"loss": -0.0,
"num_tokens": 27055838.0,
"reward": 0.7109375,
"reward_std": 0.16124600172042847,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 164
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.511192125942753e-09,
"advantages/std": 0.49596524238586426,
"advantages/var": 0.24598152165486908,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 0.7043756670224119,
"grad_norm": 0.24471970204004162,
"learning_rate": 9.526977229639965e-07,
"loss": 0.0,
"num_tokens": 27206332.0,
"reward": 0.70703125,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 165
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 7.041900874137961e-09,
"advantages/std": 0.49595409631729126,
"advantages/var": 0.24597046565390102,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 0.7086446104589115,
"grad_norm": 0.21906254322665267,
"learning_rate": 9.521285209656962e-07,
"loss": -0.0,
"num_tokens": 27351069.0,
"reward": 0.6328125,
"reward_std": 0.11481393873691559,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 166
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.812142072827069e-10,
"advantages/std": 0.5960737466812134,
"advantages/var": 0.35530391148257934,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 0.7129135538954109,
"grad_norm": 0.2553570949745252,
"learning_rate": 9.515560868609352e-07,
"loss": 0.0,
"num_tokens": 27492145.0,
"reward": 0.8046875,
"reward_std": 0.18056842684745789,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 167
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 3.387563197817429e-09,
"advantages/std": 0.618579089641571,
"advantages/var": 0.3826400901417948,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.7171824973319103,
"grad_norm": 0.23929974194308415,
"learning_rate": 9.509804247418421e-07,
"loss": -0.0,
"num_tokens": 27659931.0,
"reward": 0.625,
"reward_std": 0.1990984082221985,
"rewards/drgrpo_math_reward/mean": 0.625,
"rewards/drgrpo_math_reward/std": 0.4850712716579437,
"step": 168
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.520259612560154e-09,
"advantages/std": 0.5483072400093079,
"advantages/var": 0.30064082944662474,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 0.7214514407684098,
"grad_norm": 0.2769979980823483,
"learning_rate": 9.504015387236213e-07,
"loss": -0.0,
"num_tokens": 27801772.0,
"reward": 0.796875,
"reward_std": 0.15110857784748077,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 169
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.272636100449617e-10,
"advantages/std": 0.6402922868728638,
"advantages/var": 0.4099742126288817,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 0.7257203842049093,
"grad_norm": 0.29082131808158546,
"learning_rate": 9.498194329445234e-07,
"loss": -0.0,
"num_tokens": 27973134.0,
"reward": 0.53125,
"reward_std": 0.21686306595802307,
"rewards/drgrpo_math_reward/mean": 0.53125,
"rewards/drgrpo_math_reward/std": 0.5,
"step": 170
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.5629763089128757e-09,
"advantages/std": 0.5227778553962708,
"advantages/var": 0.2732966860927242,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.7299893276414088,
"grad_norm": 0.24172170954180977,
"learning_rate": 9.492341115658165e-07,
"loss": 0.0,
"num_tokens": 28131313.0,
"reward": 0.796875,
"reward_std": 0.12468298524618149,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 171
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5960737466812134,
"advantages/var": 0.35530391148257934,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.515625,
"epoch": 0.7342582710779082,
"grad_norm": 0.24724705785330842,
"learning_rate": 9.486455787717555e-07,
"loss": 0.0,
"num_tokens": 28314443.0,
"reward": 0.6171875,
"reward_std": 0.18056842684745789,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 172
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.979316211871183e-10,
"advantages/std": 0.4675956070423126,
"advantages/var": 0.21864565172526884,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5,
"epoch": 0.7385272145144077,
"grad_norm": 0.14575457358998883,
"learning_rate": 9.480538387695524e-07,
"loss": 0.0,
"num_tokens": 28486372.0,
"reward": 0.671875,
"reward_std": 0.10877159237861633,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 173
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.225042778153291e-09,
"advantages/std": 0.4959655702114105,
"advantages/var": 0.24598184683512958,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.7427961579509071,
"grad_norm": 0.30035164580113827,
"learning_rate": 9.47458895789347e-07,
"loss": 0.0,
"num_tokens": 28638341.0,
"reward": 0.71875,
"reward_std": 0.12730157375335693,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 174
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.1292017686950893e-09,
"advantages/std": 0.618571400642395,
"advantages/var": 0.3826305776926944,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.453125,
"epoch": 0.7470651013874067,
"grad_norm": 0.26582968832894643,
"learning_rate": 9.468607540841753e-07,
"loss": 0.0,
"num_tokens": 28802994.0,
"reward": 0.62109375,
"reward_std": 0.18937908113002777,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 175
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 1.1133803351426107e-08,
"advantages/std": 0.5228012204170227,
"advantages/var": 0.27332111606952836,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 0.7513340448239061,
"grad_norm": 0.15052650271144502,
"learning_rate": 9.462594179299405e-07,
"loss": 0.0,
"num_tokens": 28970090.0,
"reward": 0.71875,
"reward_std": 0.14860230684280396,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 176
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 2.1292782847223813e-09,
"advantages/std": 0.4373888373374939,
"advantages/var": 0.1913089950274447,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 0.7556029882604055,
"grad_norm": 0.15101127830494457,
"learning_rate": 9.456548916253814e-07,
"loss": -0.0,
"num_tokens": 29133620.0,
"reward": 0.640625,
"reward_std": 0.0875919908285141,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 177
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.6348574440747036e-09,
"advantages/std": 0.6185588836669922,
"advantages/var": 0.3826150925633556,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 0.759871931696905,
"grad_norm": 0.23053809270745587,
"learning_rate": 9.450471794920424e-07,
"loss": -0.0,
"num_tokens": 29314272.0,
"reward": 0.58984375,
"reward_std": 0.1745324432849884,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 178
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.661603659212928e-09,
"advantages/std": 0.4373878836631775,
"advantages/var": 0.1913081607753533,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 0.7641408751334045,
"grad_norm": 0.24006538352367196,
"learning_rate": 9.444362858742416e-07,
"loss": -0.0,
"num_tokens": 29452831.0,
"reward": 0.7578125,
"reward_std": 0.08811995387077332,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 179
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.7604386253420069e-09,
"advantages/std": 0.6612858772277832,
"advantages/var": 0.43729901142091876,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 0.768409818569904,
"grad_norm": 0.26441195567315334,
"learning_rate": 9.438222151390412e-07,
"loss": -0.0,
"num_tokens": 29616682.0,
"reward": 0.640625,
"reward_std": 0.22290541231632233,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 180
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.286176964687214e-09,
"advantages/std": 0.49596065282821655,
"advantages/var": 0.24597696915379075,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 0.7726787620064034,
"grad_norm": 0.22092294807165377,
"learning_rate": 9.432049716762149e-07,
"loss": -0.0,
"num_tokens": 29766007.0,
"reward": 0.71875,
"reward_std": 0.12217669934034348,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 181
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.527988578946327e-10,
"advantages/std": 0.6185733079910278,
"advantages/var": 0.382632937358963,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 0.7769477054429029,
"grad_norm": 0.24586197971513324,
"learning_rate": 9.425845598982176e-07,
"loss": -0.0,
"num_tokens": 29912855.0,
"reward": 0.72265625,
"reward_std": 0.1910889893770218,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 182
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.6614848777545554e-09,
"advantages/std": 0.43740740418434143,
"advantages/var": 0.19132523723528383,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 0.7812166488794023,
"grad_norm": 0.1573793941424989,
"learning_rate": 9.419609842401528e-07,
"loss": -0.0,
"num_tokens": 30075024.0,
"reward": 0.6640625,
"reward_std": 0.10520447790622711,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 183
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.2596858417853543e-09,
"advantages/std": 0.36966460943222046,
"advantages/var": 0.1366519234666761,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.7854855923159018,
"grad_norm": 0.15453159134925487,
"learning_rate": 9.413342491597418e-07,
"loss": -0.0,
"num_tokens": 30234875.0,
"reward": 0.71484375,
"reward_std": 0.0665532797574997,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 184
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.246388168920893e-10,
"advantages/std": 0.5483027696609497,
"advantages/var": 0.30063592721786847,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 0.7897545357524013,
"grad_norm": 0.2528426577376825,
"learning_rate": 9.407043591372916e-07,
"loss": 0.0,
"num_tokens": 30385131.0,
"reward": 0.734375,
"reward_std": 0.1459837108850479,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 185
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.944999661654864e-09,
"advantages/std": 0.5482975840568542,
"advantages/var": 0.30063024068258315,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 0.7940234791889007,
"grad_norm": 0.2395271449170319,
"learning_rate": 9.400713186756623e-07,
"loss": 0.0,
"num_tokens": 30550599.0,
"reward": 0.703125,
"reward_std": 0.14138680696487427,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 186
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.0112142916763455e-09,
"advantages/std": 0.6185694336891174,
"advantages/var": 0.38262814429447545,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 0.7982924226254002,
"grad_norm": 0.22912614754777264,
"learning_rate": 9.39435132300236e-07,
"loss": 0.0,
"num_tokens": 30731714.0,
"reward": 0.60546875,
"reward_std": 0.18596413731575012,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 187
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.492741255292344e-09,
"advantages/std": 0.5483050346374512,
"advantages/var": 0.30063841100877653,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 0.8025613660618997,
"grad_norm": 0.2553756632167738,
"learning_rate": 9.387958045588835e-07,
"loss": 0.0,
"num_tokens": 30884902.0,
"reward": 0.6953125,
"reward_std": 0.14939865469932556,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 188
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.3236407809283777e-09,
"advantages/std": 0.7014055252075195,
"advantages/var": 0.4919697107916363,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.484375,
"epoch": 0.8068303094983992,
"grad_norm": 0.24399140874857214,
"learning_rate": 9.381533400219317e-07,
"loss": 0.0,
"num_tokens": 31067486.0,
"reward": 0.58203125,
"reward_std": 0.2597304582595825,
"rewards/drgrpo_math_reward/mean": 0.58203125,
"rewards/drgrpo_math_reward/std": 0.49419113993644714,
"step": 189
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196689260745064e-09,
"advantages/std": 0.5726897716522217,
"advantages/var": 0.3279735745550738,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 0.8110992529348986,
"grad_norm": 0.26704959527215627,
"learning_rate": 9.375077432821321e-07,
"loss": 0.0,
"num_tokens": 31212571.0,
"reward": 0.6328125,
"reward_std": 0.16834037005901337,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 190
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.907071115042357e-10,
"advantages/std": 0.522799551486969,
"advantages/var": 0.27331937103497594,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 0.8153681963713981,
"grad_norm": 0.253644840740441,
"learning_rate": 9.368590189546267e-07,
"loss": -0.0,
"num_tokens": 31362826.0,
"reward": 0.7109375,
"reward_std": 0.14795321226119995,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 191
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.9530591491397785e-09,
"advantages/std": 0.5960665345191956,
"advantages/var": 0.35529531357372335,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 0.8196371398078975,
"grad_norm": 0.2227309105410048,
"learning_rate": 9.362071716769158e-07,
"loss": -0.0,
"num_tokens": 31535677.0,
"reward": 0.60546875,
"reward_std": 0.1720261573791504,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 192
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 4.72739521502879e-09,
"advantages/std": 0.6402676701545715,
"advantages/var": 0.4099426894451632,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.823906083244397,
"grad_norm": 0.2681728150646929,
"learning_rate": 9.355522061088241e-07,
"loss": 0.0,
"num_tokens": 31704414.0,
"reward": 0.6015625,
"reward_std": 0.1845201551914215,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"step": 193
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.2853073748283135e-09,
"advantages/std": 0.5726816058158875,
"advantages/var": 0.3279642216398635,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.8281750266808965,
"grad_norm": 0.25600457019311335,
"learning_rate": 9.348941269324686e-07,
"loss": -0.0,
"num_tokens": 31871378.0,
"reward": 0.62109375,
"reward_std": 0.15703225135803223,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 194
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 4.898922066460967e-09,
"advantages/std": 0.5227960348129272,
"advantages/var": 0.27331569401611944,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.8324439701173959,
"grad_norm": 0.17012108669775766,
"learning_rate": 9.342329388522237e-07,
"loss": -0.0,
"num_tokens": 32037929.0,
"reward": 0.66796875,
"reward_std": 0.14453580975532532,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 195
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.4686569410831624e-09,
"advantages/std": 0.5960565805435181,
"advantages/var": 0.35528344720923144,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 0.8367129135538954,
"grad_norm": 0.31281434138995456,
"learning_rate": 9.335686465946886e-07,
"loss": 0.0,
"num_tokens": 32174851.0,
"reward": 0.7734375,
"reward_std": 0.16059692203998566,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 196
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.7639801445415536e-10,
"advantages/std": 0.6185756325721741,
"advantages/var": 0.3826358132120653,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.8409818569903948,
"grad_norm": 0.24160142918946306,
"learning_rate": 9.32901254908653e-07,
"loss": 0.0,
"num_tokens": 32342089.0,
"reward": 0.7109375,
"reward_std": 0.19503435492515564,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 197
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.3361013392583426e-09,
"advantages/std": 0.522783637046814,
"advantages/var": 0.2733027311638949,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.59375,
"epoch": 0.8452508004268944,
"grad_norm": 0.21258248842995997,
"learning_rate": 9.322307685650637e-07,
"loss": 0.0,
"num_tokens": 32498458.0,
"reward": 0.69921875,
"reward_std": 0.12981030344963074,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 198
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.246439870363116e-09,
"advantages/std": 0.5482960939407349,
"advantages/var": 0.30062860663066715,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 0.8495197438633938,
"grad_norm": 0.20952831368827907,
"learning_rate": 9.315571923569892e-07,
"loss": -0.0,
"num_tokens": 32642747.0,
"reward": 0.7109375,
"reward_std": 0.13914892077445984,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 199
},
{
"advantages/mean": -6.752088665962219e-09,
"advantages/snr": 1.0210589122953355e-08,
"advantages/std": 0.6612829566001892,
"advantages/var": 0.4372951486898877,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.8537886872998933,
"grad_norm": 0.30185620500016624,
"learning_rate": 9.308805310995875e-07,
"loss": 0.0,
"num_tokens": 32794582.0,
"reward": 0.73828125,
"reward_std": 0.21937178075313568,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 200
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.734251092552386e-09,
"advantages/std": 0.5960734486579895,
"advantages/var": 0.35530355619502885,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.8580576307363927,
"grad_norm": 0.2462816018228872,
"learning_rate": 9.302007896300697e-07,
"loss": -0.0,
"num_tokens": 32961774.0,
"reward": 0.63671875,
"reward_std": 0.18003800511360168,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 201
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.599654419242252e-09,
"advantages/std": 0.404953271150589,
"advantages/var": 0.16398715181556245,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 0.8623265741728922,
"grad_norm": 0.16802665427080618,
"learning_rate": 9.295179728076665e-07,
"loss": 0.0,
"num_tokens": 33116837.0,
"reward": 0.72265625,
"reward_std": 0.08390620350837708,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 202
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 4.516864526265062e-09,
"advantages/std": 0.6185635328292847,
"advantages/var": 0.38262084414624553,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.8665955176093917,
"grad_norm": 0.25804002111389396,
"learning_rate": 9.288320855135934e-07,
"loss": -0.0,
"num_tokens": 33286278.0,
"reward": 0.61328125,
"reward_std": 0.17912934720516205,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 203
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.991711378624568e-09,
"advantages/std": 0.46759915351867676,
"advantages/var": 0.21864896837138303,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 0.8708644610458911,
"grad_norm": 0.22827939959255067,
"learning_rate": 9.281431326510152e-07,
"loss": -0.0,
"num_tokens": 33443163.0,
"reward": 0.6640625,
"reward_std": 0.11165857315063477,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 204
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 7.0310319200493365e-09,
"advantages/std": 0.5960649251937866,
"advantages/var": 0.35529339504627444,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 0.8751334044823906,
"grad_norm": 0.22458008637077748,
"learning_rate": 9.274511191450119e-07,
"loss": -0.0,
"num_tokens": 33610355.0,
"reward": 0.6875,
"reward_std": 0.16925784945487976,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 205
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 7.643394011345789e-09,
"advantages/std": 0.5483102798461914,
"advantages/var": 0.30064416298500873,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.87940234791889,
"grad_norm": 0.2290235050594148,
"learning_rate": 9.267560499425424e-07,
"loss": 0.0,
"num_tokens": 33762863.0,
"reward": 0.7734375,
"reward_std": 0.1539955586194992,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 206
},
{
"advantages/mean": -5.122274160385132e-09,
"advantages/snr": 7.514663704136494e-09,
"advantages/std": 0.681637167930603,
"advantages/var": 0.4646292287044531,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.8836712913553896,
"grad_norm": 0.2785376078782206,
"learning_rate": 9.260579300124099e-07,
"loss": 0.0,
"num_tokens": 33917344.0,
"reward": 0.671875,
"reward_std": 0.23672226071357727,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 207
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.812217066674933e-10,
"advantages/std": 0.5960680246353149,
"advantages/var": 0.3552970899926464,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5,
"epoch": 0.887940234791889,
"grad_norm": 0.29013235358343903,
"learning_rate": 9.253567643452262e-07,
"loss": 0.0,
"num_tokens": 34092208.0,
"reward": 0.59765625,
"reward_std": 0.1726752519607544,
"rewards/drgrpo_math_reward/mean": 0.59765625,
"rewards/drgrpo_math_reward/std": 0.4913311004638672,
"step": 208
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.449885533156584e-09,
"advantages/std": 0.404936283826828,
"advantages/var": 0.1639733939594814,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 0.8922091782283885,
"grad_norm": 0.2085125843665539,
"learning_rate": 9.246525579533764e-07,
"loss": 0.0,
"num_tokens": 34244162.0,
"reward": 0.64453125,
"reward_std": 0.06970865279436111,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 209
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5483095049858093,
"advantages/var": 0.30064331325778326,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.8964781216648879,
"grad_norm": 0.18475872134993088,
"learning_rate": 9.23945315870982e-07,
"loss": 0.0,
"num_tokens": 34403014.0,
"reward": 0.6484375,
"reward_std": 0.15452352166175842,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 210
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.859129990066525e-09,
"advantages/std": 0.5960713624954224,
"advantages/var": 0.3553010691871492,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.9007470651013874,
"grad_norm": 0.29702733030568923,
"learning_rate": 9.232350431538656e-07,
"loss": 0.0,
"num_tokens": 34564823.0,
"reward": 0.67578125,
"reward_std": 0.17662307620048523,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 211
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.5624249773879439e-09,
"advantages/std": 0.5960750579833984,
"advantages/var": 0.3553054747499118,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.515625,
"epoch": 0.9050160085378869,
"grad_norm": 0.21062195720897037,
"learning_rate": 9.225217448795154e-07,
"loss": 0.0,
"num_tokens": 34741244.0,
"reward": 0.6640625,
"reward_std": 0.18280631303787231,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 212
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.3306388556957245,
"advantages/var": 0.10932205289577812,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 0.9092849519743863,
"grad_norm": 0.14741888821412977,
"learning_rate": 9.218054261470476e-07,
"loss": 0.0,
"num_tokens": 34873153.0,
"reward": 0.796875,
"reward_std": 0.05326685309410095,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 213
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 4.453650819722754e-09,
"advantages/std": 0.522786021232605,
"advantages/var": 0.2733052239962177,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 0.9135538954108858,
"grad_norm": 0.15938017835116514,
"learning_rate": 9.210860920771705e-07,
"loss": -0.0,
"num_tokens": 35048090.0,
"reward": 0.60546875,
"reward_std": 0.13322526216506958,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 214
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 8.492832655171976e-10,
"advantages/std": 0.5482991337776184,
"advantages/var": 0.3006319401012867,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 0.9178228388473852,
"grad_norm": 0.22427508241794308,
"learning_rate": 9.203637478121491e-07,
"loss": 0.0,
"num_tokens": 35218953.0,
"reward": 0.6015625,
"reward_std": 0.14203590154647827,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"step": 215
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 7.151580191519881e-09,
"advantages/std": 0.6185740828514099,
"advantages/var": 0.38263389597546293,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 0.9220917822838848,
"grad_norm": 0.20779930017559525,
"learning_rate": 9.196383985157656e-07,
"loss": 0.0,
"num_tokens": 35366948.0,
"reward": 0.78515625,
"reward_std": 0.1922660619020462,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 216
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2739017715075114e-09,
"advantages/std": 0.5483090877532959,
"advantages/var": 0.30064285571285154,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 0.9263607257203842,
"grad_norm": 0.22856666349627422,
"learning_rate": 9.189100493732851e-07,
"loss": -0.0,
"num_tokens": 35510655.0,
"reward": 0.77734375,
"reward_std": 0.15228809416294098,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 217
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.3971400787152124e-09,
"advantages/std": 0.5482980012893677,
"advantages/var": 0.30063069821791544,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 0.9306296691568837,
"grad_norm": 0.22475360749870224,
"learning_rate": 9.181787055914175e-07,
"loss": 0.0,
"num_tokens": 35667550.0,
"reward": 0.48046875,
"reward_std": 0.14032843708992004,
"rewards/drgrpo_math_reward/mean": 0.48046875,
"rewards/drgrpo_math_reward/std": 0.5005971193313599,
"step": 218
},
{
"advantages/mean": -6.05359673500061e-09,
"advantages/snr": 9.786289910659712e-09,
"advantages/std": 0.6185793280601501,
"advantages/var": 0.38264038510334686,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.59375,
"epoch": 0.9348986125933831,
"grad_norm": 0.244956248158164,
"learning_rate": 9.174443723982799e-07,
"loss": 0.0,
"num_tokens": 35833973.0,
"reward": 0.65234375,
"reward_std": 0.1996288150548935,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 219
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 7.636313400668574e-09,
"advantages/std": 0.6402884721755981,
"advantages/var": 0.4099693276009617,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 0.9391675560298826,
"grad_norm": 0.24276848059353862,
"learning_rate": 9.167070550433602e-07,
"loss": 0.0,
"num_tokens": 35997006.0,
"reward": 0.703125,
"reward_std": 0.21173818409442902,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 220
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.022429163667084e-09,
"advantages/std": 0.6185693740844727,
"advantages/var": 0.38262807055525627,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 0.9434364994663821,
"grad_norm": 0.23098088315813128,
"learning_rate": 9.159667587974785e-07,
"loss": 0.0,
"num_tokens": 36144820.0,
"reward": 0.73828125,
"reward_std": 0.18596413731575012,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 221
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755638038437843e-09,
"advantages/std": 0.49595969915390015,
"advantages/var": 0.24597602318482714,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.9477054429028815,
"grad_norm": 0.203738257085365,
"learning_rate": 9.152234889527501e-07,
"loss": -0.0,
"num_tokens": 36302905.0,
"reward": 0.7265625,
"reward_std": 0.12099964916706085,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 222
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.528071999011651e-10,
"advantages/std": 0.6185664534568787,
"advantages/var": 0.38262445734222084,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 0.951974386339381,
"grad_norm": 0.2859031051149433,
"learning_rate": 9.144772508225476e-07,
"loss": -0.0,
"num_tokens": 36465340.0,
"reward": 0.609375,
"reward_std": 0.1825467348098755,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 223
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 4.893158628587196e-09,
"advantages/std": 0.6185775995254517,
"advantages/var": 0.38263824663467005,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.9562433297758804,
"grad_norm": 0.33728645483097885,
"learning_rate": 9.137280497414628e-07,
"loss": 0.0,
"num_tokens": 36622798.0,
"reward": 0.71875,
"reward_std": 0.19674429297447205,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 224
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.269580834375063e-09,
"advantages/std": 0.6185746192932129,
"advantages/var": 0.38263455963374327,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 0.96051227321238,
"grad_norm": 0.23678336081671283,
"learning_rate": 9.129758910652683e-07,
"loss": 0.0,
"num_tokens": 36787184.0,
"reward": 0.65234375,
"reward_std": 0.1933268904685974,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 225
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.5478913430760094e-09,
"advantages/std": 0.5482901930809021,
"advantages/var": 0.3006221358286929,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 0.9647812166488794,
"grad_norm": 0.23416868197278523,
"learning_rate": 9.122207801708801e-07,
"loss": -0.0,
"num_tokens": 36934154.0,
"reward": 0.703125,
"reward_std": 0.13178616762161255,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 226
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.5624437258124148e-09,
"advantages/std": 0.5960679054260254,
"advantages/var": 0.35529694787896915,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 0.9690501600853789,
"grad_norm": 0.24178276531312257,
"learning_rate": 9.114627224563181e-07,
"loss": 0.0,
"num_tokens": 37097501.0,
"reward": 0.71484375,
"reward_std": 0.17097023129463196,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 227
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.389007647043238e-10,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 0.9733191035218783,
"grad_norm": 0.21515667717388337,
"learning_rate": 9.10701723340668e-07,
"loss": 0.0,
"num_tokens": 37255645.0,
"reward": 0.81640625,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 228
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.972471718244625e-09,
"advantages/std": 0.5483027696609497,
"advantages/var": 0.30063592721786847,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 0.9775880469583778,
"grad_norm": 0.2875591708504233,
"learning_rate": 9.099377882640424e-07,
"loss": 0.0,
"num_tokens": 37391494.0,
"reward": 0.8046875,
"reward_std": 0.1459837108850479,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 229
},
{
"advantages/mean": 3.4924596548080444e-09,
"advantages/snr": 6.3695981791179415e-09,
"advantages/std": 0.5483013987541199,
"advantages/var": 0.30063442387572437,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.375,
"epoch": 0.9818569903948773,
"grad_norm": 0.23429010911438367,
"learning_rate": 9.091709226875428e-07,
"loss": -0.0,
"num_tokens": 37564614.0,
"reward": 0.609375,
"reward_std": 0.14545084536075592,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 230
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.2861422109191467e-09,
"advantages/std": 0.4959658980369568,
"advantages/var": 0.24598217201560502,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 0.9861259338313767,
"grad_norm": 0.21210491205337761,
"learning_rate": 9.084011320932188e-07,
"loss": -0.0,
"num_tokens": 37722632.0,
"reward": 0.63671875,
"reward_std": 0.12783199548721313,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 231
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 3.6364006470548067e-10,
"advantages/std": 0.6402777433395386,
"advantages/var": 0.40995558861597203,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 0.9903948772678762,
"grad_norm": 0.3265367211352305,
"learning_rate": 9.076284219840304e-07,
"loss": -0.0,
"num_tokens": 37898519.0,
"reward": 0.53125,
"reward_std": 0.19754064083099365,
"rewards/drgrpo_math_reward/mean": 0.53125,
"rewards/drgrpo_math_reward/std": 0.5,
"step": 232
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 1.1178199929019476e-08,
"advantages/std": 0.43740883469581604,
"advantages/var": 0.19132648866995172,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 0.9946638207043756,
"grad_norm": 0.20031446384378207,
"learning_rate": 9.068527978838084e-07,
"loss": 0.0,
"num_tokens": 38052365.0,
"reward": 0.73828125,
"reward_std": 0.10691195726394653,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 233
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.919882841693015e-09,
"advantages/std": 0.4374060034751892,
"advantages/var": 0.19132401187613723,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 0.9989327641408752,
"grad_norm": 0.18502191003874308,
"learning_rate": 9.060742653372142e-07,
"loss": -0.0,
"num_tokens": 38203481.0,
"reward": 0.70703125,
"reward_std": 0.10349701344966888,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 234
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.572685182094574,
"advantages/var": 0.32796831779069535,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.0042689434364995,
"grad_norm": 0.2681599551540745,
"learning_rate": 9.052928299097012e-07,
"loss": -0.0,
"num_tokens": 38362882.0,
"reward": 0.62109375,
"reward_std": 0.1626850962638855,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 235
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.9724594393523208e-09,
"advantages/std": 0.5483050346374512,
"advantages/var": 0.30063841100877653,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 1.0085378868729988,
"grad_norm": 0.23210538289797444,
"learning_rate": 9.045084971874737e-07,
"loss": 0.0,
"num_tokens": 38502567.0,
"reward": 0.796875,
"reward_std": 0.14939865469932556,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 236
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.8181795048296057e-09,
"advantages/std": 0.640285074710846,
"advantages/var": 0.4099649768974736,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.0128068303094984,
"grad_norm": 0.2611427222928886,
"learning_rate": 9.037212727774485e-07,
"loss": 0.0,
"num_tokens": 38659097.0,
"reward": 0.68359375,
"reward_std": 0.20725995302200317,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 237
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.7814572812374478e-09,
"advantages/std": 0.5227869153022766,
"advantages/var": 0.27330615881126974,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 1.017075773745998,
"grad_norm": 0.19890422356269907,
"learning_rate": 9.029311623072137e-07,
"loss": -0.0,
"num_tokens": 38828844.0,
"reward": 0.67578125,
"reward_std": 0.1344023048877716,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 238
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.944765720258187e-09,
"advantages/std": 0.5483191609382629,
"advantages/var": 0.3006539022520407,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.0213447171824974,
"grad_norm": 0.2278453686272756,
"learning_rate": 9.021381714249887e-07,
"loss": 0.0,
"num_tokens": 38990118.0,
"reward": 0.7734375,
"reward_std": 0.16424530744552612,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 239
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5624385699508314e-09,
"advantages/std": 0.596069872379303,
"advantages/var": 0.35529929275827854,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 1.0256136606189967,
"grad_norm": 0.2542975337210786,
"learning_rate": 9.013423057995844e-07,
"loss": -0.0,
"num_tokens": 39162750.0,
"reward": 0.62890625,
"reward_std": 0.17597398161888123,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 240
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.12489245127268e-09,
"advantages/std": 0.596066951751709,
"advantages/var": 0.35529581097057417,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 1.0298826040554963,
"grad_norm": 0.23443867005042662,
"learning_rate": 9.005435711203618e-07,
"loss": 0.0,
"num_tokens": 39316222.0,
"reward": 0.7890625,
"reward_std": 0.17096778750419617,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 241
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.812088953055965e-10,
"advantages/std": 0.5960777997970581,
"advantages/var": 0.3553087434109017,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.0341515474919958,
"grad_norm": 0.23796595572080942,
"learning_rate": 8.997419730971916e-07,
"loss": -0.0,
"num_tokens": 39464279.0,
"reward": 0.7265625,
"reward_std": 0.18569329380989075,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 242
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1231797745105085e-09,
"advantages/std": 0.5483064651489258,
"advantages/var": 0.30063997972411016,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.038420490928495,
"grad_norm": 0.22286596877086656,
"learning_rate": 8.989375174604141e-07,
"loss": 0.0,
"num_tokens": 39621864.0,
"reward": 0.6484375,
"reward_std": 0.15163654088974,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 243
},
{
"advantages/mean": -5.3551048040390015e-09,
"advantages/snr": 1.0797358794099725e-08,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 1.0426894343649946,
"grad_norm": 0.21043020237501475,
"learning_rate": 8.981302099607972e-07,
"loss": 0.0,
"num_tokens": 39785296.0,
"reward": 0.73828125,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 244
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.2267840268747123e-09,
"advantages/std": 0.5227957367897034,
"advantages/var": 0.2733153824054888,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.0469583778014941,
"grad_norm": 0.19849666793713738,
"learning_rate": 8.973200563694963e-07,
"loss": -0.0,
"num_tokens": 39966617.0,
"reward": 0.640625,
"reward_std": 0.14400538802146912,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 245
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.1175644599137753e-09,
"advantages/std": 0.5227845311164856,
"advantages/var": 0.2733036659746837,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.0512273212379937,
"grad_norm": 0.2371693296860179,
"learning_rate": 8.965070624780115e-07,
"loss": 0.0,
"num_tokens": 40132643.0,
"reward": 0.64453125,
"reward_std": 0.13098734617233276,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 246
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.2463106191180555e-10,
"advantages/std": 0.548312783241272,
"advantages/var": 0.3006469082657901,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.055496264674493,
"grad_norm": 0.23997233729464268,
"learning_rate": 8.956912340981484e-07,
"loss": -0.0,
"num_tokens": 40283074.0,
"reward": 0.72265625,
"reward_std": 0.15623590350151062,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 247
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.971350610904034e-09,
"advantages/std": 0.46757495403289795,
"advantages/var": 0.21862633763886663,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.0597652081109925,
"grad_norm": 0.2525168510699522,
"learning_rate": 8.948725770619744e-07,
"loss": 0.0,
"num_tokens": 40423442.0,
"reward": 0.7578125,
"reward_std": 0.0883883461356163,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 248
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.907167588560933e-10,
"advantages/std": 0.5227938890457153,
"advantages/var": 0.2733134504235437,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 1.064034151547492,
"grad_norm": 0.19726712226980916,
"learning_rate": 8.940510972217785e-07,
"loss": -0.0,
"num_tokens": 40587006.0,
"reward": 0.71484375,
"reward_std": 0.14123709499835968,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 249
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 9.69288060329602e-10,
"advantages/std": 0.7206236720085144,
"advantages/var": 0.519298476659035,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.0683030949839916,
"grad_norm": 0.2883836841361683,
"learning_rate": 8.932268004500287e-07,
"loss": -0.0,
"num_tokens": 40751302.0,
"reward": 0.72265625,
"reward_std": 0.2730144262313843,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 250
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 5.095673003899881e-09,
"advantages/std": 0.5483019948005676,
"advantages/var": 0.3006350775022817,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 1.0725720384204909,
"grad_norm": 0.2146721541501632,
"learning_rate": 8.923996926393305e-07,
"loss": -0.0,
"num_tokens": 40902937.0,
"reward": 0.6796875,
"reward_std": 0.14651167392730713,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 251
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.987671798143373e-09,
"advantages/std": 0.4675827622413635,
"advantages/var": 0.2186336395452635,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 1.0768409818569904,
"grad_norm": 0.20976273414921642,
"learning_rate": 8.91569779702384e-07,
"loss": -0.0,
"num_tokens": 41060790.0,
"reward": 0.671875,
"reward_std": 0.09522314369678497,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 252
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.6918481539330074e-09,
"advantages/std": 0.5726837515830994,
"advantages/var": 0.32796667932729306,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.08110992529349,
"grad_norm": 0.2813986497841566,
"learning_rate": 8.907370675719427e-07,
"loss": 0.0,
"num_tokens": 41208791.0,
"reward": 0.69921875,
"reward_std": 0.16044719517230988,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 253
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.9723957846157257e-09,
"advantages/std": 0.5483167767524719,
"advantages/var": 0.30065128766822014,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.0853788687299892,
"grad_norm": 0.2259849805731124,
"learning_rate": 8.899015622007702e-07,
"loss": -0.0,
"num_tokens": 41363716.0,
"reward": 0.640625,
"reward_std": 0.1624191403388977,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 254
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.408345822654131e-09,
"advantages/std": 0.6612882614135742,
"advantages/var": 0.4373021646833877,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.0896478121664888,
"grad_norm": 0.2887201039560959,
"learning_rate": 8.890632695615982e-07,
"loss": -0.0,
"num_tokens": 41534919.0,
"reward": 0.5859375,
"reward_std": 0.22738116979599,
"rewards/drgrpo_math_reward/mean": 0.5859375,
"rewards/drgrpo_math_reward/std": 0.4935242533683777,
"step": 255
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.1175854313291422e-09,
"advantages/std": 0.5227810144424438,
"advantages/var": 0.2732999890614707,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.0939167556029883,
"grad_norm": 0.19084207845075485,
"learning_rate": 8.882221956470836e-07,
"loss": 0.0,
"num_tokens": 41699529.0,
"reward": 0.640625,
"reward_std": 0.12756995856761932,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 256
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.645987625947185e-09,
"advantages/std": 0.6185737252235413,
"advantages/var": 0.3826334535369291,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 1.0981856990394878,
"grad_norm": 0.32427544566232364,
"learning_rate": 8.873783464697653e-07,
"loss": 0.0,
"num_tokens": 41857360.0,
"reward": 0.6875,
"reward_std": 0.19332444667816162,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 257
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.906072989360551e-09,
"advantages/std": 0.5960734486579895,
"advantages/var": 0.35530355619502885,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.1024546424759871,
"grad_norm": 0.26379798941422533,
"learning_rate": 8.865317280620219e-07,
"loss": 0.0,
"num_tokens": 42022785.0,
"reward": 0.64453125,
"reward_std": 0.18003800511360168,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 258
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 2.987613550456549e-09,
"advantages/std": 0.7013878226280212,
"advantages/var": 0.4919448777308766,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.1067235859124867,
"grad_norm": 0.344142203779848,
"learning_rate": 8.856823464760282e-07,
"loss": -0.0,
"num_tokens": 42171320.0,
"reward": 0.625,
"reward_std": 0.233575701713562,
"rewards/drgrpo_math_reward/mean": 0.625,
"rewards/drgrpo_math_reward/std": 0.4850712716579437,
"step": 259
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196938070589645e-09,
"advantages/std": 0.5726780891418457,
"advantages/var": 0.3279601937831558,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 1.1109925293489862,
"grad_norm": 0.23515318491841494,
"learning_rate": 8.84830207783712e-07,
"loss": 0.0,
"num_tokens": 42335796.0,
"reward": 0.55078125,
"reward_std": 0.1530844271183014,
"rewards/drgrpo_math_reward/mean": 0.55078125,
"rewards/drgrpo_math_reward/std": 0.49838894605636597,
"step": 260
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.90728742173617e-10,
"advantages/std": 0.5227868556976318,
"advantages/var": 0.27330609649021653,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.1152614727854857,
"grad_norm": 0.2014522180480839,
"learning_rate": 8.839753180767107e-07,
"loss": -0.0,
"num_tokens": 42502750.0,
"reward": 0.70703125,
"reward_std": 0.13269728422164917,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 261
},
{
"advantages/mean": -6.984919309616089e-09,
"advantages/snr": 1.2196931723273553e-08,
"advantages/std": 0.5726783871650696,
"advantages/var": 0.32796053512598533,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.119530416221985,
"grad_norm": 0.2783647958076793,
"learning_rate": 8.831176834663273e-07,
"loss": 0.0,
"num_tokens": 42666009.0,
"reward": 0.640625,
"reward_std": 0.1536148637533188,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 262
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.6722203494045257e-09,
"advantages/std": 0.522780179977417,
"advantages/var": 0.2732991165772205,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.1237993596584845,
"grad_norm": 0.22229712386788078,
"learning_rate": 8.822573100834879e-07,
"loss": 0.0,
"num_tokens": 42809029.0,
"reward": 0.78125,
"reward_std": 0.1263929009437561,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 263
},
{
"advantages/mean": 3.725290298461914e-09,
"advantages/snr": 7.1257210721944304e-09,
"advantages/std": 0.5227948427200317,
"advantages/var": 0.2733144475746627,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 1.128068303094984,
"grad_norm": 0.21192413693613738,
"learning_rate": 8.813942040786963e-07,
"loss": -0.0,
"num_tokens": 43000488.0,
"reward": 0.546875,
"reward_std": 0.1428283452987671,
"rewards/drgrpo_math_reward/mean": 0.546875,
"rewards/drgrpo_math_reward/std": 0.4987730085849762,
"step": 264
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.7260719743060095e-09,
"advantages/std": 0.4374082088470459,
"advantages/var": 0.19132594116678092,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 1.1323372465314834,
"grad_norm": 0.18260371159172073,
"learning_rate": 8.805283716219915e-07,
"loss": 0.0,
"num_tokens": 43169988.0,
"reward": 0.625,
"reward_std": 0.10626532137393951,
"rewards/drgrpo_math_reward/mean": 0.625,
"rewards/drgrpo_math_reward/std": 0.4850712716579437,
"step": 265
},
{
"advantages/mean": -5.3551048040390015e-09,
"advantages/snr": 8.984029862526269e-09,
"advantages/std": 0.5960693359375,
"advantages/var": 0.35529865324497223,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.136606189967983,
"grad_norm": 0.24260356114505385,
"learning_rate": 8.796598189029029e-07,
"loss": 0.0,
"num_tokens": 43331520.0,
"reward": 0.72265625,
"reward_std": 0.17491313815116882,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 266
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.789606785993915e-09,
"advantages/std": 0.5227985978126526,
"advantages/var": 0.2733183738748757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.1408751334044824,
"grad_norm": 0.2635927347438612,
"learning_rate": 8.787885521304055e-07,
"loss": 0.0,
"num_tokens": 43490313.0,
"reward": 0.69140625,
"reward_std": 0.14636196196079254,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 267
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 8.068112086076467e-09,
"advantages/std": 0.5483044981956482,
"advantages/var": 0.3006378227415816,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.1451440768409817,
"grad_norm": 0.2815394159637722,
"learning_rate": 8.779145775328764e-07,
"loss": -0.0,
"num_tokens": 43645813.0,
"reward": 0.66796875,
"reward_std": 0.14875200390815735,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 268
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.5477847097207762e-09,
"advantages/std": 0.5483131408691406,
"advantages/var": 0.30064730044978205,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.1494130202774813,
"grad_norm": 0.2345865010716582,
"learning_rate": 8.770379013580507e-07,
"loss": -0.0,
"num_tokens": 43806619.0,
"reward": 0.734375,
"reward_std": 0.15847133100032806,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 269
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.492707096246389e-10,
"advantages/std": 0.5483072400093079,
"advantages/var": 0.30064082944662474,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.1536819637139808,
"grad_norm": 0.24516543455701023,
"learning_rate": 8.761585298729748e-07,
"loss": 0.0,
"num_tokens": 43962231.0,
"reward": 0.703125,
"reward_std": 0.15110856294631958,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 270
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 4.979379684270509e-09,
"advantages/std": 0.4675896465778351,
"advantages/var": 0.21864007758678472,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.1579509071504803,
"grad_norm": 0.24033704608392403,
"learning_rate": 8.752764693639638e-07,
"loss": 0.0,
"num_tokens": 44114241.0,
"reward": 0.671875,
"reward_std": 0.10258589684963226,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 271
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.48964112740099e-09,
"advantages/std": 0.4675987958908081,
"advantages/var": 0.21864863391853362,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.1622198505869796,
"grad_norm": 0.23283406072005608,
"learning_rate": 8.743917261365557e-07,
"loss": 0.0,
"num_tokens": 44280233.0,
"reward": 0.58984375,
"reward_std": 0.11112815141677856,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 272
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.485510240816933e-09,
"advantages/std": 0.467597097158432,
"advantages/var": 0.2186470452709921,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.1664887940234792,
"grad_norm": 0.3827013434118944,
"learning_rate": 8.73504306515466e-07,
"loss": -0.0,
"num_tokens": 44427455.0,
"reward": 0.73828125,
"reward_std": 0.10889027267694473,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 273
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.520296218227876e-09,
"advantages/std": 0.5483036041259766,
"advantages/var": 0.3006368422975356,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.1707577374599787,
"grad_norm": 0.2654311262205045,
"learning_rate": 8.726142168445425e-07,
"loss": 0.0,
"num_tokens": 44576560.0,
"reward": 0.6484375,
"reward_std": 0.14716076850891113,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 274
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.5478470268314738e-09,
"advantages/std": 0.5482997298240662,
"advantages/var": 0.30063259372514395,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 1.1750266808964782,
"grad_norm": 0.2816453673564088,
"learning_rate": 8.717214634867211e-07,
"loss": 0.0,
"num_tokens": 44750347.0,
"reward": 0.6015625,
"reward_std": 0.14309673011302948,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"step": 275
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.634804126955208e-09,
"advantages/std": 0.618571400642395,
"advantages/var": 0.3826305776926944,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.1792956243329775,
"grad_norm": 0.23661445655492572,
"learning_rate": 8.708260528239788e-07,
"loss": -0.0,
"num_tokens": 44920222.0,
"reward": 0.62109375,
"reward_std": 0.18937908113002777,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 276
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 2.816750546713503e-09,
"advantages/std": 0.6612744331359863,
"advantages/var": 0.43728387591932005,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.183564567769477,
"grad_norm": 0.26084562938037026,
"learning_rate": 8.699279912572888e-07,
"loss": -0.0,
"num_tokens": 45097288.0,
"reward": 0.55859375,
"reward_std": 0.20858919620513916,
"rewards/drgrpo_math_reward/mean": 0.55859375,
"rewards/drgrpo_math_reward/std": 0.4975275993347168,
"step": 277
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 5.789625268215301e-09,
"advantages/std": 0.5227969288825989,
"advantages/var": 0.27331662884907715,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 1.1878335112059766,
"grad_norm": 0.25195978563158833,
"learning_rate": 8.690272852065748e-07,
"loss": -0.0,
"num_tokens": 45265747.0,
"reward": 0.67578125,
"reward_std": 0.14571286737918854,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 278
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.344476650405274e-09,
"advantages/std": 0.5227766633033752,
"advantages/var": 0.27329543969461056,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.1921024546424759,
"grad_norm": 0.19475972227751492,
"learning_rate": 8.68123941110665e-07,
"loss": -0.0,
"num_tokens": 45425884.0,
"reward": 0.86328125,
"reward_std": 0.12297550588846207,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"step": 279
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 1.0615910412769516e-08,
"advantages/std": 0.548305869102478,
"advantages/var": 0.30063932609222377,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 1.1963713980789754,
"grad_norm": 0.26798133987683354,
"learning_rate": 8.67217965427246e-07,
"loss": 0.0,
"num_tokens": 45569425.0,
"reward": 0.84375,
"reward_std": 0.15057571232318878,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"step": 280
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5227810144424438,
"advantages/var": 0.2732999890614707,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 1.200640341515475,
"grad_norm": 0.3170370623918755,
"learning_rate": 8.663093646328166e-07,
"loss": 0.0,
"num_tokens": 45728046.0,
"reward": 0.6953125,
"reward_std": 0.12756995856761932,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 281
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.344285320354163e-09,
"advantages/std": 0.5227953791618347,
"advantages/var": 0.2733150084729665,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.2049092849519745,
"grad_norm": 0.24331646146386493,
"learning_rate": 8.653981452226417e-07,
"loss": 0.0,
"num_tokens": 45873852.0,
"reward": 0.76953125,
"reward_std": 0.1434749811887741,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 282
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.285310125301036e-09,
"advantages/std": 0.5726813077926636,
"advantages/var": 0.3279638802951155,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.2091782283884738,
"grad_norm": 0.2142418838550912,
"learning_rate": 8.644843137107057e-07,
"loss": 0.0,
"num_tokens": 46047250.0,
"reward": 0.59375,
"reward_std": 0.15650184452533722,
"rewards/drgrpo_math_reward/mean": 0.59375,
"rewards/drgrpo_math_reward/std": 0.49209436774253845,
"step": 283
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.8553042319251684e-09,
"advantages/std": 0.437404602766037,
"advantages/var": 0.1913227865209146,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.2134471718249733,
"grad_norm": 0.1783701475963193,
"learning_rate": 8.635678766296661e-07,
"loss": 0.0,
"num_tokens": 46205057.0,
"reward": 0.703125,
"reward_std": 0.10178953409194946,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 284
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.5155159600246683e-09,
"advantages/std": 0.5960649251937866,
"advantages/var": 0.35529339504627444,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.2177161152614728,
"grad_norm": 0.2221401615180632,
"learning_rate": 8.626488405308066e-07,
"loss": 0.0,
"num_tokens": 46368520.0,
"reward": 0.6640625,
"reward_std": 0.16925784945487976,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 285
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196389686850863e-09,
"advantages/std": 0.5727038383483887,
"advantages/var": 0.3279896864589773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.2219850586979724,
"grad_norm": 0.24138394163239055,
"learning_rate": 8.617272119839902e-07,
"loss": -0.0,
"num_tokens": 46523291.0,
"reward": 0.7109375,
"reward_std": 0.18424785137176514,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 286
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.5478276389592506e-09,
"advantages/std": 0.5483039021492004,
"advantages/var": 0.30063716911203997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 1.2262540021344717,
"grad_norm": 0.20263774043829955,
"learning_rate": 8.608029975776128e-07,
"loss": -0.0,
"num_tokens": 46678953.0,
"reward": 0.76171875,
"reward_std": 0.14769119024276733,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 287
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.9917351169221575e-09,
"advantages/std": 0.46759358048439026,
"advantages/var": 0.21864375651021195,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.2305229455709712,
"grad_norm": 0.21616322659681173,
"learning_rate": 8.598762039185552e-07,
"loss": 0.0,
"num_tokens": 46830553.0,
"reward": 0.79296875,
"reward_std": 0.1060032919049263,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 288
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.898892464424674e-09,
"advantages/std": 0.5227991938591003,
"advantages/var": 0.2733189970997252,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.2347918890074707,
"grad_norm": 0.19897774941835714,
"learning_rate": 8.589468376321367e-07,
"loss": -0.0,
"num_tokens": 46992262.0,
"reward": 0.66015625,
"reward_std": 0.14742279052734375,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 289
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.9530138407463264e-09,
"advantages/std": 0.5960803627967834,
"advantages/var": 0.355311798911945,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.23906083244397,
"grad_norm": 0.24347332154396464,
"learning_rate": 8.580149053620674e-07,
"loss": 0.0,
"num_tokens": 47158417.0,
"reward": 0.67578125,
"reward_std": 0.18804985284805298,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 290
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.8167024633676336e-09,
"advantages/std": 0.49596428871154785,
"advantages/var": 0.2459805756771516,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.2433297758804696,
"grad_norm": 0.19247935669529417,
"learning_rate": 8.570804137704003e-07,
"loss": 0.0,
"num_tokens": 47314092.0,
"reward": 0.68359375,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 291
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.6615166122208e-09,
"advantages/std": 0.4374021887779236,
"advantages/var": 0.1913206747477183,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 1.247598719316969,
"grad_norm": 0.1410680828176049,
"learning_rate": 8.561433695374848e-07,
"loss": 0.0,
"num_tokens": 47475514.0,
"reward": 0.80078125,
"reward_std": 0.10061002522706985,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 292
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.6363810130421453e-09,
"advantages/std": 0.6402812004089355,
"advantages/var": 0.4099600155971075,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.2518676627534684,
"grad_norm": 0.29802936592619206,
"learning_rate": 8.552037793619175e-07,
"loss": -0.0,
"num_tokens": 47642570.0,
"reward": 0.74609375,
"reward_std": 0.20043008029460907,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 293
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.906044476527982e-09,
"advantages/std": 0.5960777997970581,
"advantages/var": 0.3553087434109017,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.256136606189968,
"grad_norm": 0.27625707603096233,
"learning_rate": 8.542616499604957e-07,
"loss": 0.0,
"num_tokens": 47807632.0,
"reward": 0.6640625,
"reward_std": 0.18569329380989075,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 294
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.2250712129826735e-09,
"advantages/std": 0.4959622323513031,
"advantages/var": 0.24597853591888796,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.2604055496264674,
"grad_norm": 0.23483656229841748,
"learning_rate": 8.533169880681681e-07,
"loss": 0.0,
"num_tokens": 47959296.0,
"reward": 0.7578125,
"reward_std": 0.1244145929813385,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 295
},
{
"advantages/mean": -5.587935447692871e-09,
"advantages/snr": 8.197904556506459e-09,
"advantages/std": 0.6816297173500061,
"advantages/var": 0.4646190715746492,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 1.264674493062967,
"grad_norm": 0.24304630329493054,
"learning_rate": 8.523698004379875e-07,
"loss": 0.0,
"num_tokens": 48133667.0,
"reward": 0.68359375,
"reward_std": 0.22594210505485535,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 296
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.8778028834468404e-09,
"advantages/std": 0.4959639608860016,
"advantages/var": 0.2459802504977313,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.2689434364994665,
"grad_norm": 0.19984218005460322,
"learning_rate": 8.514200938410627e-07,
"loss": -0.0,
"num_tokens": 48299805.0,
"reward": 0.6796875,
"reward_std": 0.1250636875629425,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 297
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.2524389602364913e-09,
"advantages/std": 0.572691798210144,
"advantages/var": 0.32797589573716834,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.2732123799359658,
"grad_norm": 0.24154623696060273,
"learning_rate": 8.504678750665093e-07,
"loss": -0.0,
"num_tokens": 48459104.0,
"reward": 0.6484375,
"reward_std": 0.17005029320716858,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 298
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.8786360989624384e-09,
"advantages/std": 0.5726944208145142,
"advantages/var": 0.32797889963207183,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 1.2774813233724653,
"grad_norm": 0.23284752992750668,
"learning_rate": 8.495131509214013e-07,
"loss": 0.0,
"num_tokens": 48614547.0,
"reward": 0.65234375,
"reward_std": 0.172406867146492,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 299
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.226759654792445e-09,
"advantages/std": 0.5228014588356018,
"advantages/var": 0.27332136536063345,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.2817502668089649,
"grad_norm": 0.2681174827752255,
"learning_rate": 8.485559282307235e-07,
"loss": 0.0,
"num_tokens": 48774856.0,
"reward": 0.72265625,
"reward_std": 0.14913272857666016,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 300
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.7814442823074754e-09,
"advantages/std": 0.5227907299995422,
"advantages/var": 0.27331014737345427,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 1.2860192102454642,
"grad_norm": 0.238237970833212,
"learning_rate": 8.475962138373212e-07,
"loss": -0.0,
"num_tokens": 48928475.0,
"reward": 0.76171875,
"reward_std": 0.13835011422634125,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 301
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.528019770580436e-10,
"advantages/std": 0.6185707449913025,
"advantages/var": 0.382629766559095,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 1.2902881536819637,
"grad_norm": 0.24635422188933434,
"learning_rate": 8.466340146018522e-07,
"loss": 0.0,
"num_tokens": 49094017.0,
"reward": 0.71484375,
"reward_std": 0.189907044172287,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 302
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.493818094524657e-09,
"advantages/std": 0.46758833527565,
"advantages/var": 0.2186388512858537,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.2945570971184632,
"grad_norm": 0.15612960075723187,
"learning_rate": 8.456693374027378e-07,
"loss": -0.0,
"num_tokens": 49260136.0,
"reward": 0.68359375,
"reward_std": 0.10087842494249344,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 303
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 4.599761390615809e-09,
"advantages/std": 0.4049438536167145,
"advantages/var": 0.16397952458195508,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.2988260405549625,
"grad_norm": 0.22303969723586534,
"learning_rate": 8.44702189136113e-07,
"loss": -0.0,
"num_tokens": 49389252.0,
"reward": 0.8046875,
"reward_std": 0.0765409916639328,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 304
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.85536048412357e-09,
"advantages/std": 0.4374004006385803,
"advantages/var": 0.19131911047879058,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 1.303094983991462,
"grad_norm": 0.22184859832120044,
"learning_rate": 8.437325767157781e-07,
"loss": 0.0,
"num_tokens": 49534956.0,
"reward": 0.74609375,
"reward_std": 0.09837214648723602,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 305
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.953029854679673e-09,
"advantages/std": 0.5960754752159119,
"advantages/var": 0.35530597215387516,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.3073639274279616,
"grad_norm": 0.29883705651901826,
"learning_rate": 8.427605070731481e-07,
"loss": 0.0,
"num_tokens": 49695896.0,
"reward": 0.62890625,
"reward_std": 0.1817479282617569,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 306
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.102843969045556e-09,
"advantages/std": 0.49596521258354187,
"advantages/var": 0.24598149209303788,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.3116328708644611,
"grad_norm": 0.19332884739544304,
"learning_rate": 8.417859871572044e-07,
"loss": 0.0,
"num_tokens": 49850166.0,
"reward": 0.72265625,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 307
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.49285019678803e-09,
"advantages/std": 0.5482980012893677,
"advantages/var": 0.30063069821791544,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.3159018143009606,
"grad_norm": 0.2742934910268536,
"learning_rate": 8.408090239344441e-07,
"loss": 0.0,
"num_tokens": 50005479.0,
"reward": 0.69921875,
"reward_std": 0.14032843708992004,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 308
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.022409433040476e-09,
"advantages/std": 0.618571400642395,
"advantages/var": 0.3826305776926944,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.32017075773746,
"grad_norm": 0.29720642123032165,
"learning_rate": 8.39829624388831e-07,
"loss": 0.0,
"num_tokens": 50164299.0,
"reward": 0.66015625,
"reward_std": 0.18937906622886658,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 309
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 8.907327028325072e-10,
"advantages/std": 0.5227845311164856,
"advantages/var": 0.2733036659746837,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.3244397011739595,
"grad_norm": 0.21448321446048993,
"learning_rate": 8.38847795521745e-07,
"loss": 0.0,
"num_tokens": 50316273.0,
"reward": 0.71484375,
"reward_std": 0.13098734617233276,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 310
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.252507001666207e-09,
"advantages/std": 0.5726798176765442,
"advantages/var": 0.3279621735740399,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.328708644610459,
"grad_norm": 0.24406260372773594,
"learning_rate": 8.378635443519326e-07,
"loss": 0.0,
"num_tokens": 50469383.0,
"reward": 0.734375,
"reward_std": 0.15585274994373322,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 311
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.252496846048193e-09,
"advantages/std": 0.5726816058158875,
"advantages/var": 0.3279642216398635,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 1.3329775880469583,
"grad_norm": 0.27648193971156826,
"learning_rate": 8.368768779154562e-07,
"loss": 0.0,
"num_tokens": 50634882.0,
"reward": 0.62890625,
"reward_std": 0.15703225135803223,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 312
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.225127068226113e-09,
"advantages/std": 0.4959556758403778,
"advantages/var": 0.2459720323982859,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 1.3372465314834578,
"grad_norm": 0.23643394395444925,
"learning_rate": 8.358878032656445e-07,
"loss": 0.0,
"num_tokens": 50788065.0,
"reward": 0.75,
"reward_std": 0.11705183982849121,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 313
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.065633752082759e-10,
"advantages/std": 0.5726798176765442,
"advantages/var": 0.3279621735740399,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 1.3415154749199574,
"grad_norm": 0.2949257223504475,
"learning_rate": 8.348963274730412e-07,
"loss": 0.0,
"num_tokens": 50960166.0,
"reward": 0.5703125,
"reward_std": 0.15585274994373322,
"rewards/drgrpo_math_reward/mean": 0.5703125,
"rewards/drgrpo_math_reward/std": 0.4960011839866638,
"step": 314
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5227879881858826,
"advantages/var": 0.2733072805914425,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 1.3457844183564567,
"grad_norm": 0.21650235168913853,
"learning_rate": 8.339024576253553e-07,
"loss": -0.0,
"num_tokens": 51138096.0,
"reward": 0.6484375,
"reward_std": 0.1344047486782074,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 315
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.859100109905457e-09,
"advantages/std": 0.5960744023323059,
"advantages/var": 0.3553046931158157,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.3500533617929562,
"grad_norm": 0.2614104176232604,
"learning_rate": 8.329062008274098e-07,
"loss": 0.0,
"num_tokens": 51297255.0,
"reward": 0.75,
"reward_std": 0.18004046380519867,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 316
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755680466206912e-09,
"advantages/std": 0.49595409631729126,
"advantages/var": 0.24597046565390102,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 1.3543223052294557,
"grad_norm": 0.22619377392286,
"learning_rate": 8.319075642010913e-07,
"loss": 0.0,
"num_tokens": 51455445.0,
"reward": 0.6953125,
"reward_std": 0.11481393873691559,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 317
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.0646357329349468e-09,
"advantages/std": 0.4373902380466461,
"advantages/var": 0.19131022033850176,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 1.358591248665955,
"grad_norm": 0.21774584926886661,
"learning_rate": 8.309065548852989e-07,
"loss": -0.0,
"num_tokens": 51597160.0,
"reward": 0.71484375,
"reward_std": 0.08929947018623352,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 318
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.489578768976071e-09,
"advantages/std": 0.46761050820350647,
"advantages/var": 0.2186595873823416,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 1.3628601921024546,
"grad_norm": 0.21396670829968895,
"learning_rate": 8.299031800358931e-07,
"loss": 0.0,
"num_tokens": 51734808.0,
"reward": 0.7890625,
"reward_std": 0.12138034403324127,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 319
},
{
"advantages/mean": 6.05359673500061e-09,
"advantages/snr": 9.786257849423335e-09,
"advantages/std": 0.6185813546180725,
"advantages/var": 0.3826428922811296,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 1.367129135538954,
"grad_norm": 0.24862330990714387,
"learning_rate": 8.288974468256451e-07,
"loss": -0.0,
"num_tokens": 51904309.0,
"reward": 0.60546875,
"reward_std": 0.20304375886917114,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 320
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5624062294134083e-09,
"advantages/std": 0.5960822105407715,
"advantages/var": 0.3553140017231726,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.3713980789754536,
"grad_norm": 0.273560968050573,
"learning_rate": 8.278893624441847e-07,
"loss": 0.0,
"num_tokens": 52076956.0,
"reward": 0.68359375,
"reward_std": 0.1913485825061798,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 321
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.812061612278906e-10,
"advantages/std": 0.5960798859596252,
"advantages/var": 0.35531123044563984,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.3756670224119532,
"grad_norm": 0.27764714587091543,
"learning_rate": 8.268789340979498e-07,
"loss": -0.0,
"num_tokens": 52236966.0,
"reward": 0.71875,
"reward_std": 0.18740323185920715,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 322
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.193883478155236e-09,
"advantages/std": 0.4373934864997864,
"advantages/var": 0.1913130620324388,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.3799359658484525,
"grad_norm": 0.18734396601228684,
"learning_rate": 8.258661690101345e-07,
"loss": 0.0,
"num_tokens": 52380779.0,
"reward": 0.6640625,
"reward_std": 0.09324482083320618,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 323
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.618571400642395,
"advantages/var": 0.3826305776926944,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.384204909284952,
"grad_norm": 0.28559951594842736,
"learning_rate": 8.248510744206369e-07,
"loss": -0.0,
"num_tokens": 52553345.0,
"reward": 0.57421875,
"reward_std": 0.18937908113002777,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 324
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983447384004837e-09,
"advantages/std": 0.46759626269340515,
"advantages/var": 0.21864626488483996,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.3884738527214515,
"grad_norm": 0.18535459393670384,
"learning_rate": 8.238336575860083e-07,
"loss": 0.0,
"num_tokens": 52697338.0,
"reward": 0.83984375,
"reward_std": 0.10941824316978455,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"step": 325
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.907286406187238e-09,
"advantages/std": 0.5227869153022766,
"advantages/var": 0.27330615881126974,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.3927427961579508,
"grad_norm": 0.22524115569862238,
"learning_rate": 8.228139257794012e-07,
"loss": 0.0,
"num_tokens": 52862363.0,
"reward": 0.73828125,
"reward_std": 0.1344023048877716,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 326
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 5.285326628197485e-09,
"advantages/std": 0.5726795196533203,
"advantages/var": 0.3279618322303577,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.3970117395944504,
"grad_norm": 0.2536769347881268,
"learning_rate": 8.217918862905162e-07,
"loss": -0.0,
"num_tokens": 53016775.0,
"reward": 0.77734375,
"reward_std": 0.15532232820987701,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 327
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5624340390702578e-09,
"advantages/std": 0.5960716009140015,
"advantages/var": 0.35530135341618063,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.515625,
"epoch": 1.4012806830309499,
"grad_norm": 0.33070974810083065,
"learning_rate": 8.207675464255516e-07,
"loss": -0.0,
"num_tokens": 53200673.0,
"reward": 0.5546875,
"reward_std": 0.17715348303318024,
"rewards/drgrpo_math_reward/mean": 0.5546875,
"rewards/drgrpo_math_reward/std": 0.49797385931015015,
"step": 328
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.131157486115432e-10,
"advantages/std": 0.572687566280365,
"advantages/var": 0.32797104857212744,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.4055496264674492,
"grad_norm": 0.33652698710835777,
"learning_rate": 8.197409135071496e-07,
"loss": 0.0,
"num_tokens": 53348505.0,
"reward": 0.7890625,
"reward_std": 0.16492542624473572,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 329
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.9917200106673123e-09,
"advantages/std": 0.4675971269607544,
"advantages/var": 0.21864707314195186,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.4098185699039487,
"grad_norm": 0.2079452368372239,
"learning_rate": 8.187119948743449e-07,
"loss": -0.0,
"num_tokens": 53509742.0,
"reward": 0.66796875,
"reward_std": 0.10889027267694473,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 330
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.979264165711202e-10,
"advantages/std": 0.4676004946231842,
"advantages/var": 0.21865022257184652,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 1.4140875133404482,
"grad_norm": 0.19778389967748286,
"learning_rate": 8.176807978825118e-07,
"loss": 0.0,
"num_tokens": 53673940.0,
"reward": 0.74609375,
"reward_std": 0.113366037607193,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 331
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.33065417408943176,
"advantages/var": 0.10933218284276425,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 1.4183564567769478,
"grad_norm": 0.19611606356891836,
"learning_rate": 8.16647329903312e-07,
"loss": -0.0,
"num_tokens": 53826053.0,
"reward": 0.65234375,
"reward_std": 0.06404700875282288,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 332
},
{
"advantages/mean": -6.28642737865448e-09,
"advantages/snr": 1.1465324084482167e-08,
"advantages/std": 0.5482991337776184,
"advantages/var": 0.3006319401012867,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 1.4226254002134473,
"grad_norm": 0.3003562797692729,
"learning_rate": 8.156115983246419e-07,
"loss": 0.0,
"num_tokens": 53978781.0,
"reward": 0.75,
"reward_std": 0.14203590154647827,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 333
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 7.421619571772616e-09,
"advantages/std": 0.596066951751709,
"advantages/var": 0.35529581097057417,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.4268943436499466,
"grad_norm": 0.2821425443858336,
"learning_rate": 8.145736105505787e-07,
"loss": 0.0,
"num_tokens": 54130148.0,
"reward": 0.7734375,
"reward_std": 0.17096778750419617,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 334
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.51129367972865e-09,
"advantages/std": 0.495958536863327,
"advantages/var": 0.2459748702876121,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 1.4311632870864461,
"grad_norm": 0.23600341276988868,
"learning_rate": 8.135333740013294e-07,
"loss": 0.0,
"num_tokens": 54296752.0,
"reward": 0.64453125,
"reward_std": 0.12099719047546387,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 335
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983355987268271e-09,
"advantages/std": 0.4676069915294647,
"advantages/var": 0.2186562985272369,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.4354322305229457,
"grad_norm": 0.1762747193777988,
"learning_rate": 8.124908961131757e-07,
"loss": 0.0,
"num_tokens": 54461352.0,
"reward": 0.765625,
"reward_std": 0.11849336326122284,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 336
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.090908300523042e-09,
"advantages/std": 0.6402843594551086,
"advantages/var": 0.40996406096283877,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.439701173959445,
"grad_norm": 0.24480847749808787,
"learning_rate": 8.114461843384228e-07,
"loss": 0.0,
"num_tokens": 54623714.0,
"reward": 0.79296875,
"reward_std": 0.20608291029930115,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 337
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.0246066868757e-09,
"advantages/std": 0.404962420463562,
"advantages/var": 0.1639945619877068,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.4439701173959445,
"grad_norm": 0.19200558195614426,
"learning_rate": 8.103992461453445e-07,
"loss": 0.0,
"num_tokens": 54760860.0,
"reward": 0.74609375,
"reward_std": 0.09244601428508759,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 338
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5726826786994934,
"advantages/var": 0.3279654504824272,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.448239060832444,
"grad_norm": 0.2429698349876259,
"learning_rate": 8.093500890181307e-07,
"loss": 0.0,
"num_tokens": 54927519.0,
"reward": 0.6328125,
"reward_std": 0.15873971581459045,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 339
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 2.816677681974272e-09,
"advantages/std": 0.6612915396690369,
"advantages/var": 0.43730650043784536,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.4525080042689433,
"grad_norm": 0.29709285080753056,
"learning_rate": 8.082987204568335e-07,
"loss": -0.0,
"num_tokens": 55096190.0,
"reward": 0.67578125,
"reward_std": 0.23185941576957703,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 340
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 5.323084000677362e-09,
"advantages/std": 0.4373980164527893,
"advantages/var": 0.19131702479683454,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.4567769477054429,
"grad_norm": 0.2037369960547768,
"learning_rate": 8.072451479773143e-07,
"loss": 0.0,
"num_tokens": 55262990.0,
"reward": 0.703125,
"reward_std": 0.09719263017177582,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 341
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.9530466500626467e-09,
"advantages/std": 0.5960703492164612,
"advantages/var": 0.355299861215034,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.4610458911419424,
"grad_norm": 0.2637448981496169,
"learning_rate": 8.061893791111886e-07,
"loss": 0.0,
"num_tokens": 55413687.0,
"reward": 0.71875,
"reward_std": 0.17662061750888824,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 342
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.453634063173048e-09,
"advantages/std": 0.5227879881858826,
"advantages/var": 0.2733072805914425,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.465314834578442,
"grad_norm": 0.2520712775108035,
"learning_rate": 8.05131421405774e-07,
"loss": -0.0,
"num_tokens": 55573021.0,
"reward": 0.75,
"reward_std": 0.1344047486782074,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 343
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 1.1336897468247492e-08,
"advantages/std": 0.36967357993125916,
"advantages/var": 0.13665855569919305,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.4695837780149412,
"grad_norm": 0.167234859844493,
"learning_rate": 8.040712824240347e-07,
"loss": 0.0,
"num_tokens": 55724996.0,
"reward": 0.76953125,
"reward_std": 0.07232724130153656,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 344
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.970936510266691e-09,
"advantages/std": 0.4676027297973633,
"advantages/var": 0.21865231291394593,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.4738527214514408,
"grad_norm": 0.1888848391325854,
"learning_rate": 8.030089697445286e-07,
"loss": 0.0,
"num_tokens": 55891954.0,
"reward": 0.640625,
"reward_std": 0.1145455539226532,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 345
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.2268017985210387e-09,
"advantages/std": 0.5227915644645691,
"advantages/var": 0.2733110198753117,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.4781216648879403,
"grad_norm": 0.23996629606975922,
"learning_rate": 8.019444909613521e-07,
"loss": -0.0,
"num_tokens": 56047635.0,
"reward": 0.76171875,
"reward_std": 0.13952717185020447,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 346
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.687330240004959e-09,
"advantages/std": 0.5960680246353149,
"advantages/var": 0.3552970899926464,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.4823906083244398,
"grad_norm": 0.2624983197756317,
"learning_rate": 8.008778536840867e-07,
"loss": 0.0,
"num_tokens": 56211206.0,
"reward": 0.59765625,
"reward_std": 0.1726752519607544,
"rewards/drgrpo_math_reward/mean": 0.59765625,
"rewards/drgrpo_math_reward/std": 0.4913311004638672,
"step": 347
},
{
"advantages/mean": 4.190951585769653e-09,
"advantages/snr": 7.643456328090667e-09,
"advantages/std": 0.5483058094978333,
"advantages/var": 0.3006392607290742,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.4866595517609391,
"grad_norm": 0.23557605254214864,
"learning_rate": 7.998090655377441e-07,
"loss": 0.0,
"num_tokens": 56390588.0,
"reward": 0.578125,
"reward_std": 0.14887069165706635,
"rewards/drgrpo_math_reward/mean": 0.578125,
"rewards/drgrpo_math_reward/std": 0.49482619762420654,
"step": 348
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.7897975545984724e-09,
"advantages/std": 0.5227813720703125,
"advantages/var": 0.2733003629837185,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 1.4909284951974386,
"grad_norm": 0.21543497640620152,
"learning_rate": 7.987381341627116e-07,
"loss": 0.0,
"num_tokens": 56573406.0,
"reward": 0.64453125,
"reward_std": 0.12810038030147552,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 349
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.878720895786924e-09,
"advantages/std": 0.5726844668388367,
"advantages/var": 0.3279674985584826,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.4951974386339382,
"grad_norm": 0.26031703247446014,
"learning_rate": 7.976650672146976e-07,
"loss": 0.0,
"num_tokens": 56734640.0,
"reward": 0.65234375,
"reward_std": 0.15991923213005066,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 350
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.545457467686302e-09,
"advantages/std": 0.6402835249900818,
"advantages/var": 0.4099629923737247,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.4994663820704375,
"grad_norm": 0.26914821743698814,
"learning_rate": 7.965898723646776e-07,
"loss": 0.0,
"num_tokens": 56899076.0,
"reward": 0.671875,
"reward_std": 0.20608046650886536,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 351
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.1939374485847554e-09,
"advantages/std": 0.43738609552383423,
"advantages/var": 0.19130659655758464,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.503735325506937,
"grad_norm": 0.17746551797009705,
"learning_rate": 7.955125572988381e-07,
"loss": 0.0,
"num_tokens": 57044649.0,
"reward": 0.796875,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 352
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.694597197528499e-10,
"advantages/std": 0.4959544539451599,
"advantages/var": 0.24597082038804174,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.5080042689434365,
"grad_norm": 0.2406487583945076,
"learning_rate": 7.944331297185222e-07,
"loss": -0.0,
"num_tokens": 57190732.0,
"reward": 0.66015625,
"reward_std": 0.11534436792135239,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 353
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.562954778661877e-09,
"advantages/std": 0.5227810144424438,
"advantages/var": 0.2732999890614707,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 1.5122732123799358,
"grad_norm": 0.260133206173505,
"learning_rate": 7.933515973401754e-07,
"loss": 0.0,
"num_tokens": 57351877.0,
"reward": 0.6953125,
"reward_std": 0.12756997346878052,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 354
},
{
"advantages/mean": 3.725290298461914e-09,
"advantages/snr": 7.125665828442654e-09,
"advantages/std": 0.5227988958358765,
"advantages/var": 0.2733186854872116,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.5165421558164356,
"grad_norm": 0.25786833797545927,
"learning_rate": 7.922679678952888e-07,
"loss": -0.0,
"num_tokens": 57501527.0,
"reward": 0.7421875,
"reward_std": 0.14689236879348755,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 355
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755686559635449e-09,
"advantages/std": 0.4959532916545868,
"advantages/var": 0.24596966750301963,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 1.520811099252935,
"grad_norm": 0.23675947330341743,
"learning_rate": 7.911822491303452e-07,
"loss": -0.0,
"num_tokens": 57632583.0,
"reward": 0.8125,
"reward_std": 0.1153419092297554,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"step": 356
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5228006839752197,
"advantages/var": 0.27332055516495757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.5250800426894342,
"grad_norm": 0.246557803675931,
"learning_rate": 7.900944488067628e-07,
"loss": -0.0,
"num_tokens": 57791374.0,
"reward": 0.63671875,
"reward_std": 0.14966067671775818,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 357
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.00822312992985e-09,
"advantages/std": 0.5227941870689392,
"advantages/var": 0.273313762033073,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 1.529348986125934,
"grad_norm": 0.25317587277774867,
"learning_rate": 7.890045747008405e-07,
"loss": 0.0,
"num_tokens": 57939727.0,
"reward": 0.765625,
"reward_std": 0.1417675018310547,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 358
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.489676036788974e-09,
"advantages/std": 0.4675922393798828,
"advantages/var": 0.21864250232829363,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 1.5336179295624333,
"grad_norm": 0.22174911867779942,
"learning_rate": 7.879126346037018e-07,
"loss": -0.0,
"num_tokens": 58074464.0,
"reward": 0.78125,
"reward_std": 0.10429581999778748,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 359
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.3876010625689843e-09,
"advantages/std": 0.6185721755027771,
"advantages/var": 0.3826315363062385,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.5378868729989328,
"grad_norm": 0.33116736798190893,
"learning_rate": 7.86818636321239e-07,
"loss": 0.0,
"num_tokens": 58247849.0,
"reward": 0.66796875,
"reward_std": 0.190556138753891,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 360
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 2.2998135000829877e-09,
"advantages/std": 0.4049556851387024,
"advantages/var": 0.16398910692615587,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.5421558164354323,
"grad_norm": 0.18075330496000655,
"learning_rate": 7.857225876740583e-07,
"loss": -0.0,
"num_tokens": 58385344.0,
"reward": 0.7734375,
"reward_std": 0.0866745114326477,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 361
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.226817031586524e-09,
"advantages/std": 0.5227879881858826,
"advantages/var": 0.2733072805914425,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.5464247598719316,
"grad_norm": 0.2604824454603905,
"learning_rate": 7.846244964974224e-07,
"loss": 0.0,
"num_tokens": 58548454.0,
"reward": 0.765625,
"reward_std": 0.1344047486782074,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 362
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.94480707872331e-09,
"advantages/std": 0.5483153462409973,
"advantages/var": 0.30064971892338477,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.5506937033084311,
"grad_norm": 0.3140737111444032,
"learning_rate": 7.83524370641196e-07,
"loss": 0.0,
"num_tokens": 58705975.0,
"reward": 0.7421875,
"reward_std": 0.16018126904964447,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 363
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.34441632512063e-09,
"advantages/std": 0.522782564163208,
"advantages/var": 0.2733016093930587,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.53125,
"epoch": 1.5549626467449307,
"grad_norm": 0.19446400216043178,
"learning_rate": 7.824222179697884e-07,
"loss": 0.0,
"num_tokens": 58874863.0,
"reward": 0.6484375,
"reward_std": 0.12980784475803375,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 364
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.9724352049941504e-09,
"advantages/std": 0.5483095049858093,
"advantages/var": 0.30064331325778326,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.55923159018143,
"grad_norm": 0.19758478072310584,
"learning_rate": 7.813180463620985e-07,
"loss": 0.0,
"num_tokens": 59023297.0,
"reward": 0.75,
"reward_std": 0.15452352166175842,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 365
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.7640222169579895e-09,
"advantages/std": 0.6185687184333801,
"advantages/var": 0.3826272594243143,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5625,
"epoch": 1.5635005336179295,
"grad_norm": 0.28858573770761964,
"learning_rate": 7.802118637114573e-07,
"loss": -0.0,
"num_tokens": 59207565.0,
"reward": 0.58984375,
"reward_std": 0.18649210035800934,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 366
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 5.7896985381826545e-09,
"advantages/std": 0.5227903127670288,
"advantages/var": 0.2733097111230478,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 1.567769477054429,
"grad_norm": 0.30868263813123614,
"learning_rate": 7.791036779255726e-07,
"loss": 0.0,
"num_tokens": 59343474.0,
"reward": 0.7109375,
"reward_std": 0.1361146867275238,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 367
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.4721687339672804e-09,
"advantages/std": 0.5726834535598755,
"advantages/var": 0.32796633798126607,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.5720384204909283,
"grad_norm": 0.22317988188778135,
"learning_rate": 7.779934969264712e-07,
"loss": -0.0,
"num_tokens": 59505174.0,
"reward": 0.734375,
"reward_std": 0.15991678833961487,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 368
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 8.461828504089582e-09,
"advantages/std": 0.5227926969528198,
"advantages/var": 0.2733122039872029,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 1.576307363927428,
"grad_norm": 0.26201326633264177,
"learning_rate": 7.768813286504438e-07,
"loss": 0.0,
"num_tokens": 59669277.0,
"reward": 0.6328125,
"reward_std": 0.13952963054180145,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 369
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 3.1175139874944796e-09,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.5805763073639274,
"grad_norm": 0.21229764390237946,
"learning_rate": 7.757671810479864e-07,
"loss": 0.0,
"num_tokens": 59824332.0,
"reward": 0.66015625,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 370
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.2267555928306024e-09,
"advantages/std": 0.5228024125099182,
"advantages/var": 0.2733223625261907,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 1.584845250800427,
"grad_norm": 0.21745259260943325,
"learning_rate": 7.746510620837458e-07,
"loss": -0.0,
"num_tokens": 59987591.0,
"reward": 0.73828125,
"reward_std": 0.15030977129936218,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 371
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.07788625415326e-09,
"advantages/std": 0.5960744619369507,
"advantages/var": 0.35530476417342527,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.5891141942369265,
"grad_norm": 0.265702997455518,
"learning_rate": 7.735329797364605e-07,
"loss": 0.0,
"num_tokens": 60145919.0,
"reward": 0.7265625,
"reward_std": 0.1817454695701599,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 372
},
{
"advantages/mean": -6.984919309616089e-09,
"advantages/snr": 1.1291838156199155e-08,
"advantages/std": 0.618581235408783,
"advantages/var": 0.38264274479985616,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 1.5933831376734258,
"grad_norm": 0.26508239510597087,
"learning_rate": 7.724129419989043e-07,
"loss": 0.0,
"num_tokens": 60318193.0,
"reward": 0.72265625,
"reward_std": 0.2013387382030487,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 373
},
{
"advantages/mean": -7.683411240577698e-09,
"advantages/snr": 1.4012945382601492e-08,
"advantages/std": 0.5483080744743347,
"advantages/var": 0.3006417445337526,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.5976520811099253,
"grad_norm": 0.2546758923492174,
"learning_rate": 7.712909568778301e-07,
"loss": -0.0,
"num_tokens": 60494194.0,
"reward": 0.625,
"reward_std": 0.152285635471344,
"rewards/drgrpo_math_reward/mean": 0.625,
"rewards/drgrpo_math_reward/std": 0.4850712716579437,
"step": 374
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 4.898950551777039e-09,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.6019210245464248,
"grad_norm": 0.22292132323131664,
"learning_rate": 7.701670323939116e-07,
"loss": -0.0,
"num_tokens": 60656586.0,
"reward": 0.67578125,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 375
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.1248730777263736e-09,
"advantages/std": 0.5960706472396851,
"advantages/var": 0.35530021650073706,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.6061899679829241,
"grad_norm": 0.29603461478600207,
"learning_rate": 7.690411765816864e-07,
"loss": -0.0,
"num_tokens": 60806490.0,
"reward": 0.69921875,
"reward_std": 0.17715102434158325,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 376
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 6.775109422012961e-09,
"advantages/std": 0.6185806393623352,
"advantages/var": 0.3826420073939154,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.6104589114194237,
"grad_norm": 0.32902896731621456,
"learning_rate": 7.679133974894982e-07,
"loss": 0.0,
"num_tokens": 60963003.0,
"reward": 0.69921875,
"reward_std": 0.20186668634414673,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 377
},
{
"advantages/mean": -5.122274160385132e-09,
"advantages/snr": 8.280908722794443e-09,
"advantages/std": 0.618564248085022,
"advantages/var": 0.3826217290089886,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.6147278548559232,
"grad_norm": 0.25689944731144304,
"learning_rate": 7.667837031794403e-07,
"loss": -0.0,
"num_tokens": 61120387.0,
"reward": 0.73828125,
"reward_std": 0.18030640482902527,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 378
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.272710617665313e-09,
"advantages/std": 0.6402875185012817,
"advantages/var": 0.4099681063485292,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.6189967982924225,
"grad_norm": 0.2899400646000739,
"learning_rate": 7.656521017272963e-07,
"loss": 0.0,
"num_tokens": 61304243.0,
"reward": 0.51953125,
"reward_std": 0.21003073453903198,
"rewards/drgrpo_math_reward/mean": 0.51953125,
"rewards/drgrpo_math_reward/std": 0.5005971193313599,
"step": 379
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.492624931018031e-10,
"advantages/std": 0.5483125448226929,
"advantages/var": 0.3006466468099376,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.6232657417289222,
"grad_norm": 0.22626008132108802,
"learning_rate": 7.645186012224838e-07,
"loss": 0.0,
"num_tokens": 61448061.0,
"reward": 0.796875,
"reward_std": 0.15741050243377686,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 380
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 5.17467405816827e-09,
"advantages/std": 0.404948353767395,
"advantages/var": 0.1639831692189233,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.6275346851654215,
"grad_norm": 0.19854078916882764,
"learning_rate": 7.633832097679957e-07,
"loss": -0.0,
"num_tokens": 61598324.0,
"reward": 0.71484375,
"reward_std": 0.07995839416980743,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 381
},
{
"advantages/mean": 5.3551048040390015e-09,
"advantages/snr": 1.0243059377680273e-08,
"advantages/std": 0.5228032469749451,
"advantages/var": 0.2733232350475454,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.631803628601921,
"grad_norm": 0.20930490430516044,
"learning_rate": 7.622459354803434e-07,
"loss": -0.0,
"num_tokens": 61758043.0,
"reward": 0.67578125,
"reward_std": 0.1514868289232254,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 382
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.090890540632927e-09,
"advantages/std": 0.640295147895813,
"advantages/var": 0.40997787641892103,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 1.6360725720384206,
"grad_norm": 0.2816231565987675,
"learning_rate": 7.611067864894971e-07,
"loss": -0.0,
"num_tokens": 61915425.0,
"reward": 0.62109375,
"reward_std": 0.2202804535627365,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 383
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.907327028325072e-10,
"advantages/std": 0.5227845311164856,
"advantages/var": 0.2733036659746837,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.64034151547492,
"grad_norm": 0.233848130807371,
"learning_rate": 7.59965770938829e-07,
"loss": -0.0,
"num_tokens": 62058323.0,
"reward": 0.78515625,
"reward_std": 0.13098736107349396,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 384
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.794348108931506e-09,
"advantages/std": 0.5482925176620483,
"advantages/var": 0.3006246849241876,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.6446104589114194,
"grad_norm": 0.2823718727784672,
"learning_rate": 7.588228969850548e-07,
"loss": -0.0,
"num_tokens": 62212329.0,
"reward": 0.7265625,
"reward_std": 0.1352011114358902,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 385
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.065573242237083e-10,
"advantages/std": 0.5726883411407471,
"advantages/var": 0.3279719360785407,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.546875,
"epoch": 1.648879402347919,
"grad_norm": 0.28800478049676365,
"learning_rate": 7.576781727981749e-07,
"loss": 0.0,
"num_tokens": 62382068.0,
"reward": 0.6171875,
"reward_std": 0.16610248386859894,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 386
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.4646680582978397e-09,
"advantages/std": 0.6612713932991028,
"advantages/var": 0.4372798555957367,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.6531483457844183,
"grad_norm": 0.27720319557160134,
"learning_rate": 7.565316065614167e-07,
"loss": 0.0,
"num_tokens": 62530940.0,
"reward": 0.74609375,
"reward_std": 0.20464137196540833,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 387
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.9724914293101917e-09,
"advantages/std": 0.5482991337776184,
"advantages/var": 0.3006319401012867,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 1.6574172892209178,
"grad_norm": 0.22444635976189184,
"learning_rate": 7.553832064711756e-07,
"loss": -0.0,
"num_tokens": 62687916.0,
"reward": 0.7421875,
"reward_std": 0.14203590154647827,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 388
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 6.7753000537295894e-09,
"advantages/std": 0.6185632348060608,
"advantages/var": 0.3826204754537379,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.6616862326574173,
"grad_norm": 0.2803792143949797,
"learning_rate": 7.542329807369565e-07,
"loss": 0.0,
"num_tokens": 62831060.0,
"reward": 0.8203125,
"reward_std": 0.17859894037246704,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 389
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.5969403245552258e-09,
"advantages/std": 0.4373938739299774,
"advantages/var": 0.19131340095147298,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 1.6659551760939166,
"grad_norm": 0.18879394263981103,
"learning_rate": 7.530809375813155e-07,
"loss": -0.0,
"num_tokens": 62993130.0,
"reward": 0.72265625,
"reward_std": 0.09377524256706238,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 390
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.4720812289421775e-09,
"advantages/std": 0.5726946592330933,
"advantages/var": 0.3279791727141088,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.6702241195304164,
"grad_norm": 0.24513909553197558,
"learning_rate": 7.519270852398001e-07,
"loss": -0.0,
"num_tokens": 63157176.0,
"reward": 0.7109375,
"reward_std": 0.17123225331306458,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 391
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.980860316001973e-09,
"advantages/std": 0.49595165252685547,
"advantages/var": 0.24596804164411878,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.6744930629669157,
"grad_norm": 0.24584737442182014,
"learning_rate": 7.507714319608921e-07,
"loss": 0.0,
"num_tokens": 63306499.0,
"reward": 0.6953125,
"reward_std": 0.11310401558876038,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 392
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.516781396180255e-09,
"advantages/std": 0.6185749173164368,
"advantages/var": 0.3826349283330366,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.678762006403415,
"grad_norm": 0.33599920290529195,
"learning_rate": 7.496139860059467e-07,
"loss": 0.0,
"num_tokens": 63483796.0,
"reward": 0.546875,
"reward_std": 0.1938573122024536,
"rewards/drgrpo_math_reward/mean": 0.546875,
"rewards/drgrpo_math_reward/std": 0.4987730085849762,
"step": 393
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.9449311575715845e-09,
"advantages/std": 0.5483039021492004,
"advantages/var": 0.30063716911203997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.6830309498399147,
"grad_norm": 0.23643434716983186,
"learning_rate": 7.484547556491345e-07,
"loss": 0.0,
"num_tokens": 63637886.0,
"reward": 0.73046875,
"reward_std": 0.14769117534160614,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 394
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.3694770070372925e-09,
"advantages/std": 0.5483118295669556,
"advantages/var": 0.30064586244306213,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.687299893276414,
"grad_norm": 0.263495448332587,
"learning_rate": 7.472937491773823e-07,
"loss": 0.0,
"num_tokens": 63799240.0,
"reward": 0.64453125,
"reward_std": 0.15835265815258026,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 395
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.13136821560598e-09,
"advantages/std": 0.5726727247238159,
"advantages/var": 0.32795404964259944,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.6915688367129136,
"grad_norm": 0.24958196804145655,
"learning_rate": 7.461309748903137e-07,
"loss": -0.0,
"num_tokens": 63958086.0,
"reward": 0.71875,
"reward_std": 0.1462520956993103,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 396
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.672133215912294e-09,
"advantages/std": 0.5227972269058228,
"advantages/var": 0.2733169404604183,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.695837780149413,
"grad_norm": 0.2619349062534,
"learning_rate": 7.449664411001897e-07,
"loss": -0.0,
"num_tokens": 64114337.0,
"reward": 0.6484375,
"reward_std": 0.14624328911304474,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 397
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.816697216482564e-09,
"advantages/std": 0.49596521258354187,
"advantages/var": 0.24598149209303788,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.7001067235859124,
"grad_norm": 0.25131872608530903,
"learning_rate": 7.438001561318494e-07,
"loss": -0.0,
"num_tokens": 64269432.0,
"reward": 0.69921875,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 398
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.098235462887327e-09,
"advantages/std": 0.572700023651123,
"advantages/var": 0.3279853170899969,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.704375667022412,
"grad_norm": 0.24340881384990734,
"learning_rate": 7.426321283226503e-07,
"loss": -0.0,
"num_tokens": 64426570.0,
"reward": 0.62890625,
"reward_std": 0.1797696202993393,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 399
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.4393045940057892e-09,
"advantages/std": 0.5726975798606873,
"advantages/var": 0.32798251797828826,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.7086446104589115,
"grad_norm": 0.26003316162281526,
"learning_rate": 7.414623660224093e-07,
"loss": 0.0,
"num_tokens": 64586322.0,
"reward": 0.7109375,
"reward_std": 0.17582425475120544,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 400
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.4937602088770592e-09,
"advantages/std": 0.46760645508766174,
"advantages/var": 0.21865579683964942,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.7129135538954108,
"grad_norm": 0.21370170671538638,
"learning_rate": 7.402908775933419e-07,
"loss": 0.0,
"num_tokens": 64735034.0,
"reward": 0.63671875,
"reward_std": 0.11955174803733826,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 401
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.658925283782654e-09,
"advantages/std": 0.5727025270462036,
"advantages/var": 0.3279881844851076,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 1.7171824973319103,
"grad_norm": 0.29177020368304496,
"learning_rate": 7.391176714100037e-07,
"loss": 0.0,
"num_tokens": 64874775.0,
"reward": 0.7421875,
"reward_std": 0.18371497094631195,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 402
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.1491582420370604e-09,
"advantages/std": 0.3696712255477905,
"advantages/var": 0.13665681499800542,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.7214514407684098,
"grad_norm": 0.16600266818960302,
"learning_rate": 7.379427558592295e-07,
"loss": -0.0,
"num_tokens": 65039992.0,
"reward": 0.51171875,
"reward_std": 0.07167815417051315,
"rewards/drgrpo_math_reward/mean": 0.51171875,
"rewards/drgrpo_math_reward/std": 0.5008418560028076,
"step": 403
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.691739153544274e-09,
"advantages/std": 0.572694718837738,
"advantages/var": 0.3279792409846358,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 1.7257203842049091,
"grad_norm": 0.2684923274817556,
"learning_rate": 7.36766139340074e-07,
"loss": -0.0,
"num_tokens": 65185373.0,
"reward": 0.7421875,
"reward_std": 0.172937273979187,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 404
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.2739001097361383e-09,
"advantages/std": 0.5483098030090332,
"advantages/var": 0.3006436400758048,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.729989327641409,
"grad_norm": 0.22906645703144418,
"learning_rate": 7.355878302637514e-07,
"loss": -0.0,
"num_tokens": 65340303.0,
"reward": 0.75390625,
"reward_std": 0.15505394339561462,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 405
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.7814463133777788e-09,
"advantages/std": 0.5227901339530945,
"advantages/var": 0.2733095241586945,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 1.7342582710779082,
"grad_norm": 0.23426357271085818,
"learning_rate": 7.344078370535755e-07,
"loss": -0.0,
"num_tokens": 65494084.0,
"reward": 0.6640625,
"reward_std": 0.13770347833633423,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 406
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.1291963283089938e-09,
"advantages/std": 0.6185743808746338,
"advantages/var": 0.3826342646744365,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.7385272145144077,
"grad_norm": 0.3011024733459177,
"learning_rate": 7.332261681448995e-07,
"loss": 0.0,
"num_tokens": 65648246.0,
"reward": 0.7421875,
"reward_std": 0.1927964836359024,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 407
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.95304215824588e-09,
"advantages/std": 0.596071720123291,
"advantages/var": 0.355301495530739,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.7427961579509073,
"grad_norm": 0.2925011236710639,
"learning_rate": 7.320428319850549e-07,
"loss": 0.0,
"num_tokens": 65828462.0,
"reward": 0.6015625,
"reward_std": 0.17885848879814148,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"step": 408
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.453650819722753e-10,
"advantages/std": 0.522786021232605,
"advantages/var": 0.2733052239962177,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.7470651013874066,
"grad_norm": 0.23509345702783163,
"learning_rate": 7.308578370332925e-07,
"loss": -0.0,
"num_tokens": 65987757.0,
"reward": 0.77734375,
"reward_std": 0.1332252472639084,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 409
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 5.323123171578266e-10,
"advantages/std": 0.43739479780197144,
"advantages/var": 0.19131420914422748,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 1.751334044823906,
"grad_norm": 0.18315104597430326,
"learning_rate": 7.29671191760721e-07,
"loss": 0.0,
"num_tokens": 66130350.0,
"reward": 0.69921875,
"reward_std": 0.09324727952480316,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 410
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 7.904269953437837e-09,
"advantages/std": 0.618582546710968,
"advantages/var": 0.38264436709542693,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.7556029882604056,
"grad_norm": 0.3246528040350126,
"learning_rate": 7.284829046502467e-07,
"loss": 0.0,
"num_tokens": 66288313.0,
"reward": 0.73046875,
"reward_std": 0.20357662439346313,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 411
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.2462903089234764e-10,
"advantages/std": 0.5483154058456421,
"advantages/var": 0.3006497842876712,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.759871931696905,
"grad_norm": 0.3002740331443812,
"learning_rate": 7.272929841965126e-07,
"loss": 0.0,
"num_tokens": 66438066.0,
"reward": 0.5625,
"reward_std": 0.1618862748146057,
"rewards/drgrpo_math_reward/mean": 0.5625,
"rewards/drgrpo_math_reward/std": 0.49705013632774353,
"step": 412
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.75557733228642e-09,
"advantages/std": 0.49596771597862244,
"advantages/var": 0.2459839752930515,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 1.7641408751334045,
"grad_norm": 0.22961791009181387,
"learning_rate": 7.261014389058382e-07,
"loss": 0.0,
"num_tokens": 66586048.0,
"reward": 0.73828125,
"reward_std": 0.13018609583377838,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 413
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 7.421606213341186e-09,
"advantages/std": 0.5960680246353149,
"advantages/var": 0.3552970899926464,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.768409818569904,
"grad_norm": 0.29006183656939377,
"learning_rate": 7.249082772961582e-07,
"loss": 0.0,
"num_tokens": 66734309.0,
"reward": 0.73828125,
"reward_std": 0.1726752519607544,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 414
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.4535812554776114e-10,
"advantages/std": 0.5227941870689392,
"advantages/var": 0.273313762033073,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.7726787620064033,
"grad_norm": 0.24788218742175322,
"learning_rate": 7.237135078969618e-07,
"loss": -0.0,
"num_tokens": 66888872.0,
"reward": 0.7265625,
"reward_std": 0.1417675018310547,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 415
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.7343076901567512e-09,
"advantages/std": 0.596061110496521,
"advantages/var": 0.3552888474463458,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.776947705442903,
"grad_norm": 0.24947353174478817,
"learning_rate": 7.225171392492315e-07,
"loss": -0.0,
"num_tokens": 67070163.0,
"reward": 0.49609375,
"reward_std": 0.1646634042263031,
"rewards/drgrpo_math_reward/mean": 0.49609375,
"rewards/drgrpo_math_reward/std": 0.5009641647338867,
"step": 416
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.7814572812374478e-09,
"advantages/std": 0.5227869153022766,
"advantages/var": 0.27330615881126974,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 1.7812166488794023,
"grad_norm": 0.24503746256418613,
"learning_rate": 7.21319179905383e-07,
"loss": -0.0,
"num_tokens": 67227683.0,
"reward": 0.69921875,
"reward_std": 0.1344023048877716,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 417
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 5.818178206786404e-09,
"advantages/std": 0.6402846574783325,
"advantages/var": 0.4099644426021456,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 1.7854855923159016,
"grad_norm": 0.27811569628160054,
"learning_rate": 7.201196384292026e-07,
"loss": 0.0,
"num_tokens": 67384174.0,
"reward": 0.6484375,
"reward_std": 0.20661331713199615,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 418
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 9.38902400835478e-10,
"advantages/std": 0.495963454246521,
"advantages/var": 0.24597974794814093,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 1.7897545357524014,
"grad_norm": 0.21451958553347664,
"learning_rate": 7.189185233957867e-07,
"loss": 0.0,
"num_tokens": 67546554.0,
"reward": 0.69140625,
"reward_std": 0.12612205743789673,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 419
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.2861694609705216e-09,
"advantages/std": 0.4959617853164673,
"advantages/var": 0.24597809249429758,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.7940234791889007,
"grad_norm": 0.23429005770960726,
"learning_rate": 7.17715843391481e-07,
"loss": 0.0,
"num_tokens": 67697689.0,
"reward": 0.71484375,
"reward_std": 0.12217916548252106,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 420
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.548311710357666,
"advantages/var": 0.30064573171534903,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.7982924226254002,
"grad_norm": 0.2657669321526178,
"learning_rate": 7.165116070138182e-07,
"loss": -0.0,
"num_tokens": 67847963.0,
"reward": 0.703125,
"reward_std": 0.15623344480991364,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 421
},
{
"advantages/mean": 3.259629011154175e-09,
"advantages/snr": 6.234897888925693e-09,
"advantages/std": 0.5228039026260376,
"advantages/var": 0.2733239206010154,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.8025613660618998,
"grad_norm": 0.2325526086698693,
"learning_rate": 7.153058228714573e-07,
"loss": -0.0,
"num_tokens": 67999484.0,
"reward": 0.77734375,
"reward_std": 0.1525476574897766,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 422
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 4.523380573164966e-09,
"advantages/std": 0.7206178903579712,
"advantages/var": 0.519290143903973,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.806830309498399,
"grad_norm": 0.29965403183055045,
"learning_rate": 7.140984995841213e-07,
"loss": 0.0,
"num_tokens": 68169225.0,
"reward": 0.70703125,
"reward_std": 0.26329755783081055,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 423
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.4675910174846649,
"advantages/var": 0.2186413596323442,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.8110992529348986,
"grad_norm": 0.22870163723664103,
"learning_rate": 7.128896457825363e-07,
"loss": 0.0,
"num_tokens": 68295369.0,
"reward": 0.84765625,
"reward_std": 0.10429336875677109,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"step": 424
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.75558162001709e-09,
"advantages/std": 0.49596714973449707,
"advantages/var": 0.24598341361576104,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.8153681963713981,
"grad_norm": 0.21145463928490568,
"learning_rate": 7.116792701083696e-07,
"loss": 0.0,
"num_tokens": 68449788.0,
"reward": 0.78125,
"reward_std": 0.12953945994377136,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 425
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.090965171223325e-09,
"advantages/std": 0.6402772068977356,
"advantages/var": 0.4099549016727657,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 1.8196371398078974,
"grad_norm": 0.3193763108454424,
"learning_rate": 7.104673812141675e-07,
"loss": -0.0,
"num_tokens": 68611334.0,
"reward": 0.6875,
"reward_std": 0.19647981226444244,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 426
},
{
"advantages/mean": -6.28642737865448e-09,
"advantages/snr": 9.506647061701755e-09,
"advantages/std": 0.6612665057182312,
"advantages/var": 0.4372733915847995,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.823906083244397,
"grad_norm": 0.30199023604753306,
"learning_rate": 7.092539877632939e-07,
"loss": 0.0,
"num_tokens": 68777395.0,
"reward": 0.75390625,
"reward_std": 0.19727861881256104,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 427
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.2196798431161686e-09,
"advantages/std": 0.572684645652771,
"advantages/var": 0.3279677033664399,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 1.8281750266808965,
"grad_norm": 0.2840608093900212,
"learning_rate": 7.080390984298686e-07,
"loss": 0.0,
"num_tokens": 68935328.0,
"reward": 0.6171875,
"reward_std": 0.16203844547271729,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 428
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.1125556422290086e-09,
"advantages/std": 0.3306383490562439,
"advantages/var": 0.10932171786663858,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.8324439701173958,
"grad_norm": 0.14263664676729204,
"learning_rate": 7.068227218987042e-07,
"loss": -0.0,
"num_tokens": 69075377.0,
"reward": 0.84765625,
"reward_std": 0.05273643508553505,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"step": 429
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.5478365019499404e-09,
"advantages/std": 0.5483019948005676,
"advantages/var": 0.3006350775022817,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.8367129135538955,
"grad_norm": 0.25853283598907206,
"learning_rate": 7.056048668652454e-07,
"loss": 0.0,
"num_tokens": 69226123.0,
"reward": 0.796875,
"reward_std": 0.14651167392730713,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 430
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.694585349261446e-09,
"advantages/std": 0.4959557056427002,
"advantages/var": 0.24597206195954868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 1.8409818569903948,
"grad_norm": 0.2377317602126353,
"learning_rate": 7.04385542035506e-07,
"loss": 0.0,
"num_tokens": 69377157.0,
"reward": 0.734375,
"reward_std": 0.11705183982849121,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 431
},
{
"advantages/mean": 3.725290298461914e-09,
"advantages/snr": 7.511243128167531e-09,
"advantages/std": 0.49596187472343445,
"advantages/var": 0.24597818117918369,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 1.8452508004268944,
"grad_norm": 0.26899356838560157,
"learning_rate": 7.031647561260065e-07,
"loss": -0.0,
"num_tokens": 69547761.0,
"reward": 0.51171875,
"reward_std": 0.1238841786980629,
"rewards/drgrpo_math_reward/mean": 0.51171875,
"rewards/drgrpo_math_reward/std": 0.5008418560028076,
"step": 432
},
{
"advantages/mean": -5.3551048040390015e-09,
"advantages/snr": 1.0243288273196778e-08,
"advantages/std": 0.5227915644645691,
"advantages/var": 0.2733110198753117,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.849519743863394,
"grad_norm": 0.2671321396595432,
"learning_rate": 7.019425178637126e-07,
"loss": -0.0,
"num_tokens": 69704764.0,
"reward": 0.72265625,
"reward_std": 0.13952717185020447,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 433
},
{
"advantages/mean": 6.05359673500061e-09,
"advantages/snr": 1.220555812162176e-08,
"advantages/std": 0.4959704875946045,
"advantages/var": 0.24598672456482973,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.8537886872998932,
"grad_norm": 0.22714143912809598,
"learning_rate": 7.007188359859726e-07,
"loss": -0.0,
"num_tokens": 69871863.0,
"reward": 0.5703125,
"reward_std": 0.1324264407157898,
"rewards/drgrpo_math_reward/mean": 0.5703125,
"rewards/drgrpo_math_reward/std": 0.4960011839866638,
"step": 434
},
{
"advantages/mean": 3.259629011154175e-09,
"advantages/snr": 6.572358668578157e-09,
"advantages/std": 0.4959602952003479,
"advantages/var": 0.24597661441521623,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.8580576307363927,
"grad_norm": 0.21561949231343305,
"learning_rate": 6.994937192404537e-07,
"loss": -0.0,
"num_tokens": 70037208.0,
"reward": 0.61328125,
"reward_std": 0.12164628505706787,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 435
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.1030515379783765e-09,
"advantages/std": 0.49594834446907043,
"advantages/var": 0.24596476038161175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 1.8623265741728923,
"grad_norm": 0.2307172453911008,
"learning_rate": 6.982671763850814e-07,
"loss": 0.0,
"num_tokens": 70169422.0,
"reward": 0.796875,
"reward_std": 0.11021704226732254,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 436
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.477184298733457e-09,
"advantages/std": 0.4676010310649872,
"advantages/var": 0.2186507242530391,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 1.8665955176093916,
"grad_norm": 0.2325116554205341,
"learning_rate": 6.970392161879755e-07,
"loss": 0.0,
"num_tokens": 70307647.0,
"reward": 0.7890625,
"reward_std": 0.11230767518281937,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 437
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.1498931253727876e-09,
"advantages/std": 0.4049604833126068,
"advantages/var": 0.1639929930447801,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.870864461045891,
"grad_norm": 0.19222639805887112,
"learning_rate": 6.95809847427388e-07,
"loss": -0.0,
"num_tokens": 70451518.0,
"reward": 0.74609375,
"reward_std": 0.09020812809467316,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 438
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 9.409914999290637e-09,
"advantages/std": 0.6185779571533203,
"advantages/var": 0.382638689075975,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.8751334044823906,
"grad_norm": 0.33670992741302663,
"learning_rate": 6.945790788916401e-07,
"loss": 0.0,
"num_tokens": 70590710.0,
"reward": 0.80078125,
"reward_std": 0.1989797204732895,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 439
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.979278129208303e-10,
"advantages/std": 0.46759918332099915,
"advantages/var": 0.21864899624246537,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 1.87940234791889,
"grad_norm": 0.22493791368848148,
"learning_rate": 6.933469193790599e-07,
"loss": 0.0,
"num_tokens": 70742283.0,
"reward": 0.75,
"reward_std": 0.11165857315063477,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 440
},
{
"advantages/mean": 3.259629011154175e-09,
"advantages/snr": 7.452157657712755e-09,
"advantages/std": 0.43740740418434143,
"advantages/var": 0.19132523723528383,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 1.8836712913553897,
"grad_norm": 0.2144476365775843,
"learning_rate": 6.921133776979186e-07,
"loss": -0.0,
"num_tokens": 70905201.0,
"reward": 0.640625,
"reward_std": 0.10520448535680771,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 441
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.43740326166152954,
"advantages/var": 0.19132161331214448,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 1.887940234791889,
"grad_norm": 0.23029173426421345,
"learning_rate": 6.908784626663681e-07,
"loss": -0.0,
"num_tokens": 71047781.0,
"reward": 0.66796875,
"reward_std": 0.10178709030151367,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 442
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.389170698522892e-10,
"advantages/std": 0.4959557056427002,
"advantages/var": 0.24597206195954868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.8922091782283885,
"grad_norm": 0.3264070723484753,
"learning_rate": 6.896421831123782e-07,
"loss": -0.0,
"num_tokens": 71204495.0,
"reward": 0.703125,
"reward_std": 0.11705183982849121,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 443
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.4083860990095324e-09,
"advantages/std": 0.4959520101547241,
"advantages/var": 0.24596839637651158,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 1.896478121664888,
"grad_norm": 0.23071652035781837,
"learning_rate": 6.884045478736731e-07,
"loss": -0.0,
"num_tokens": 71349483.0,
"reward": 0.70703125,
"reward_std": 0.11363443732261658,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 444
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.129211343901943e-09,
"advantages/std": 0.6185661554336548,
"advantages/var": 0.38262408864797237,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 1.9007470651013874,
"grad_norm": 0.26391827209425767,
"learning_rate": 6.871655657976681e-07,
"loss": -0.0,
"num_tokens": 71532006.0,
"reward": 0.61328125,
"reward_std": 0.18201632797718048,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 445
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.3436707446085615e-09,
"advantages/std": 0.5960665941238403,
"advantages/var": 0.355295384630395,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 1.9050160085378869,
"grad_norm": 0.2605638541707349,
"learning_rate": 6.859252457414066e-07,
"loss": 0.0,
"num_tokens": 71686355.0,
"reward": 0.72265625,
"reward_std": 0.1720261573791504,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 446
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 4.8931765453848975e-09,
"advantages/std": 0.6185753345489502,
"advantages/var": 0.38263544451234566,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.9092849519743864,
"grad_norm": 0.26190728557911336,
"learning_rate": 6.84683596571497e-07,
"loss": 0.0,
"num_tokens": 71852703.0,
"reward": 0.65234375,
"reward_std": 0.19450394809246063,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 447
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.6985172313936096e-09,
"advantages/std": 0.5483150482177734,
"advantages/var": 0.3006493921020592,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 1.9135538954108857,
"grad_norm": 0.23280151099368152,
"learning_rate": 6.834406271640487e-07,
"loss": -0.0,
"num_tokens": 72011642.0,
"reward": 0.66796875,
"reward_std": 0.15965083241462708,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 448
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 3.779101397944587e-09,
"advantages/std": 0.36966031789779663,
"advantages/var": 0.13664875062830006,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 1.9178228388473852,
"grad_norm": 0.17630788878023093,
"learning_rate": 6.821963464046095e-07,
"loss": 0.0,
"num_tokens": 72147983.0,
"reward": 0.69921875,
"reward_std": 0.062077511101961136,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 449
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.449885533156584e-09,
"advantages/std": 0.404936283826828,
"advantages/var": 0.1639733939594814,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.9220917822838848,
"grad_norm": 0.18605668705259262,
"learning_rate": 6.809507631881013e-07,
"loss": -0.0,
"num_tokens": 72302131.0,
"reward": 0.67578125,
"reward_std": 0.06970866024494171,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 450
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 7.318148370494186e-09,
"advantages/std": 0.5726792216300964,
"advantages/var": 0.3279614908868531,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.926360725720384,
"grad_norm": 0.29718149010370043,
"learning_rate": 6.797038864187563e-07,
"loss": 0.0,
"num_tokens": 72444448.0,
"reward": 0.734375,
"reward_std": 0.154791921377182,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 451
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.8181833977205966e-09,
"advantages/std": 0.6402837038040161,
"advantages/var": 0.40996322135698904,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.9306296691568838,
"grad_norm": 0.30249912557639164,
"learning_rate": 6.78455725010055e-07,
"loss": 0.0,
"num_tokens": 72609549.0,
"reward": 0.66015625,
"reward_std": 0.20490585267543793,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 452
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.7342839024723756e-09,
"advantages/std": 0.5960662961006165,
"advantages/var": 0.35529502934710777,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 1.9348986125933831,
"grad_norm": 0.2547498641016505,
"learning_rate": 6.772062878846603e-07,
"loss": 0.0,
"num_tokens": 72767868.0,
"reward": 0.65625,
"reward_std": 0.1714957356452942,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 453
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 6.29830531336252e-09,
"advantages/std": 0.36967188119888306,
"advantages/var": 0.1366572997491211,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 1.9391675560298824,
"grad_norm": 0.17803173936691005,
"learning_rate": 6.759555839743549e-07,
"loss": 0.0,
"num_tokens": 72917624.0,
"reward": 0.6640625,
"reward_std": 0.07061977684497833,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 454
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.907301639445506e-10,
"advantages/std": 0.522786021232605,
"advantages/var": 0.2733052239962177,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.9434364994663822,
"grad_norm": 0.23256361730744385,
"learning_rate": 6.747036222199783e-07,
"loss": -0.0,
"num_tokens": 73077423.0,
"reward": 0.65234375,
"reward_std": 0.1332252323627472,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 455
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.1640342645653495e-09,
"advantages/std": 0.4959566295146942,
"advantages/var": 0.24597297835957566,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.9477054429028815,
"grad_norm": 0.23992973512319926,
"learning_rate": 6.734504115713602e-07,
"loss": -0.0,
"num_tokens": 73237735.0,
"reward": 0.71875,
"reward_std": 0.11822889000177383,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 456
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.163920693670164e-09,
"advantages/std": 0.4959675371646881,
"advantages/var": 0.24598379792120628,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.951974386339381,
"grad_norm": 0.24022370638212284,
"learning_rate": 6.721959609872598e-07,
"loss": 0.0,
"num_tokens": 73394434.0,
"reward": 0.75390625,
"reward_std": 0.1267760694026947,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 457
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5056151252025905e-09,
"advantages/std": 0.6185661554336548,
"advantages/var": 0.38262408864797237,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.9562433297758806,
"grad_norm": 0.2696626607781293,
"learning_rate": 6.709402794352992e-07,
"loss": 0.0,
"num_tokens": 73568998.0,
"reward": 0.66015625,
"reward_std": 0.18201632797718048,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 458
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.0327421924445997e-09,
"advantages/std": 0.5727008581161499,
"advantages/var": 0.32798627288697446,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 1.9605122732123799,
"grad_norm": 0.2694609947093644,
"learning_rate": 6.696833758919005e-07,
"loss": -0.0,
"num_tokens": 73714000.0,
"reward": 0.75390625,
"reward_std": 0.18094666302204132,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 459
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.3361051476250139e-09,
"advantages/std": 0.5227821469306946,
"advantages/var": 0.27330117314946634,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 1.9647812166488794,
"grad_norm": 0.2129444098803062,
"learning_rate": 6.684252593422213e-07,
"loss": -0.0,
"num_tokens": 73875813.0,
"reward": 0.68359375,
"reward_std": 0.1275724172592163,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 460
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.129194478589663e-09,
"advantages/std": 0.618575394153595,
"advantages/var": 0.3826355182522754,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 1.969050160085379,
"grad_norm": 0.26853815528519137,
"learning_rate": 6.671659387800908e-07,
"loss": 0.0,
"num_tokens": 74037738.0,
"reward": 0.60546875,
"reward_std": 0.19450394809246063,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 461
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.633534917323103e-09,
"advantages/std": 0.495952844619751,
"advantages/var": 0.24596922408642286,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.9733191035218782,
"grad_norm": 0.2702401086641601,
"learning_rate": 6.659054232079452e-07,
"loss": 0.0,
"num_tokens": 74175028.0,
"reward": 0.79296875,
"reward_std": 0.11310647428035736,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 462
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 4.000009060457362e-09,
"advantages/std": 0.6402828097343445,
"advantages/var": 0.4099620764413068,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 1.9775880469583778,
"grad_norm": 0.3298380348570836,
"learning_rate": 6.646437216367633e-07,
"loss": 0.0,
"num_tokens": 74337345.0,
"reward": 0.578125,
"reward_std": 0.20490339398384094,
"rewards/drgrpo_math_reward/mean": 0.578125,
"rewards/drgrpo_math_reward/std": 0.49482619762420654,
"step": 463
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.898997469490353e-09,
"advantages/std": 0.5227879881858826,
"advantages/var": 0.2733072805914425,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 1.9818569903948773,
"grad_norm": 0.22513828791113058,
"learning_rate": 6.633808430860019e-07,
"loss": 0.0,
"num_tokens": 74485423.0,
"reward": 0.671875,
"reward_std": 0.1344047486782074,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 464
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.9724516843147123e-09,
"advantages/std": 0.5483064651489258,
"advantages/var": 0.30063997972411016,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.9861259338313766,
"grad_norm": 0.25301570427584996,
"learning_rate": 6.621167965835322e-07,
"loss": -0.0,
"num_tokens": 74634718.0,
"reward": 0.609375,
"reward_std": 0.15163654088974,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 465
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.3472887252249853e-09,
"advantages/std": 0.49595654010772705,
"advantages/var": 0.24597288967562747,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 1.9903948772678763,
"grad_norm": 0.2737895419841632,
"learning_rate": 6.608515911655743e-07,
"loss": 0.0,
"num_tokens": 74784603.0,
"reward": 0.703125,
"reward_std": 0.1165238693356514,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 466
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.45366097526879e-10,
"advantages/std": 0.5227848291397095,
"advantages/var": 0.2733039775786352,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 1.9946638207043756,
"grad_norm": 0.2609088064315718,
"learning_rate": 6.595852358766333e-07,
"loss": -0.0,
"num_tokens": 74927831.0,
"reward": 0.6953125,
"reward_std": 0.13151778280735016,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 467
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.131246346616979e-09,
"advantages/std": 0.5726813077926636,
"advantages/var": 0.3279638802951155,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 1.9989327641408752,
"grad_norm": 0.29652117636771275,
"learning_rate": 6.583177397694337e-07,
"loss": 0.0,
"num_tokens": 75075603.0,
"reward": 0.8046875,
"reward_std": 0.15650182962417603,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 468
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.218888138077328e-09,
"advantages/std": 0.5483006238937378,
"advantages/var": 0.3006335741622621,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 2.0042689434364993,
"grad_norm": 0.3051475077518512,
"learning_rate": 6.570491119048558e-07,
"loss": 0.0,
"num_tokens": 75201411.0,
"reward": 0.78125,
"reward_std": 0.1442737877368927,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 469
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.439363748430499e-09,
"advantages/std": 0.5726836919784546,
"advantages/var": 0.32796661105807345,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 2.008537886872999,
"grad_norm": 0.2751378768343912,
"learning_rate": 6.557793613518703e-07,
"loss": 0.0,
"num_tokens": 75352067.0,
"reward": 0.58984375,
"reward_std": 0.15874217450618744,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 470
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.6721843985338186e-09,
"advantages/std": 0.5227872133255005,
"advantages/var": 0.27330647041664236,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.0128068303094984,
"grad_norm": 0.2549435165894436,
"learning_rate": 6.545084971874736e-07,
"loss": -0.0,
"num_tokens": 75507484.0,
"reward": 0.7265625,
"reward_std": 0.1349327266216278,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 471
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.1248571416022253e-09,
"advantages/std": 0.5960736870765686,
"advantages/var": 0.35530384042505503,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.0170757737459977,
"grad_norm": 0.29969731602875654,
"learning_rate": 6.532365284966232e-07,
"loss": 0.0,
"num_tokens": 75660148.0,
"reward": 0.734375,
"reward_std": 0.18056842684745789,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 472
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 1.0615912720815922e-08,
"advantages/std": 0.5483057498931885,
"advantages/var": 0.30063919536593176,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.0213447171824974,
"grad_norm": 0.29207596259062774,
"learning_rate": 6.51963464372172e-07,
"loss": 0.0,
"num_tokens": 75805634.0,
"reward": 0.6796875,
"reward_std": 0.14887069165706635,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 473
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.687273994754044e-09,
"advantages/std": 0.596075177192688,
"advantages/var": 0.3553056168652944,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 2.0256136606189967,
"grad_norm": 0.25542200864749953,
"learning_rate": 6.50689313914804e-07,
"loss": 0.0,
"num_tokens": 75962422.0,
"reward": 0.734375,
"reward_std": 0.1812175214290619,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 474
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.570994047501006e-09,
"advantages/std": 0.5228006839752197,
"advantages/var": 0.27332055516495757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.0298826040554965,
"grad_norm": 0.33695170726332435,
"learning_rate": 6.494140862329687e-07,
"loss": 0.0,
"num_tokens": 76126420.0,
"reward": 0.59765625,
"reward_std": 0.14966067671775818,
"rewards/drgrpo_math_reward/mean": 0.59765625,
"rewards/drgrpo_math_reward/std": 0.4913311004638672,
"step": 475
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.4374101758003235,
"advantages/var": 0.1913276618936699,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 2.034151547491996,
"grad_norm": 0.17944291081012842,
"learning_rate": 6.48137790442817e-07,
"loss": -0.0,
"num_tokens": 76278734.0,
"reward": 0.6640625,
"reward_std": 0.10691440105438232,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 476
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.323050633327592e-10,
"advantages/std": 0.437400758266449,
"advantages/var": 0.19131942333206453,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.038420490928495,
"grad_norm": 0.2104308698623457,
"learning_rate": 6.468604356681347e-07,
"loss": 0.0,
"num_tokens": 76419959.0,
"reward": 0.7734375,
"reward_std": 0.09890255331993103,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 477
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.49597543478012085,
"advantages/var": 0.2459916319053299,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.042689434364995,
"grad_norm": 0.21642043572235753,
"learning_rate": 6.45582031040278e-07,
"loss": -0.0,
"num_tokens": 76584561.0,
"reward": 0.703125,
"reward_std": 0.13755130767822266,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 478
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.1718066640118346e-09,
"advantages/std": 0.5960811972618103,
"advantages/var": 0.3553127937290732,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.046958377801494,
"grad_norm": 0.3204108965931746,
"learning_rate": 6.443025856981084e-07,
"loss": -0.0,
"num_tokens": 76727032.0,
"reward": 0.7890625,
"reward_std": 0.18964111804962158,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 479
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.9875465035951096e-09,
"advantages/std": 0.46760237216949463,
"advantages/var": 0.21865197845853857,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.0512273212379935,
"grad_norm": 0.23652859766283577,
"learning_rate": 6.430221087879271e-07,
"loss": -0.0,
"num_tokens": 76866220.0,
"reward": 0.74609375,
"reward_std": 0.114015132188797,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 480
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.9916905604763657e-09,
"advantages/std": 0.46760404109954834,
"advantages/var": 0.2186535392526281,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.055496264674493,
"grad_norm": 0.24549528815198077,
"learning_rate": 6.417406094634089e-07,
"loss": -0.0,
"num_tokens": 77014076.0,
"reward": 0.64453125,
"reward_std": 0.11625301837921143,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 481
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 7.527941429292418e-09,
"advantages/std": 0.6185771822929382,
"advantages/var": 0.38263773045347094,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.0597652081109925,
"grad_norm": 0.29132209347309834,
"learning_rate": 6.404580968855384e-07,
"loss": -0.0,
"num_tokens": 77182637.0,
"reward": 0.69921875,
"reward_std": 0.19780266284942627,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 482
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.286193157034731e-09,
"advantages/std": 0.49595820903778076,
"advantages/var": 0.24597454511196304,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 2.064034151547492,
"grad_norm": 0.247701806335025,
"learning_rate": 6.391745802225434e-07,
"loss": -0.0,
"num_tokens": 77327901.0,
"reward": 0.734375,
"reward_std": 0.12046677619218826,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 483
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 7.030967940225528e-09,
"advantages/std": 0.5960703492164612,
"advantages/var": 0.355299861215034,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.0683030949839916,
"grad_norm": 0.3241457763923432,
"learning_rate": 6.378900686498288e-07,
"loss": 0.0,
"num_tokens": 77484223.0,
"reward": 0.7421875,
"reward_std": 0.17662061750888824,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 484
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 8.917319186133034e-09,
"advantages/std": 0.5483086705207825,
"advantages/var": 0.300642398168268,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.072572038420491,
"grad_norm": 0.26748724345829983,
"learning_rate": 6.366045713499128e-07,
"loss": 0.0,
"num_tokens": 77636489.0,
"reward": 0.8203125,
"reward_std": 0.1533464789390564,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 485
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 6.298352535277312e-09,
"advantages/std": 0.369669109582901,
"advantages/var": 0.13665525057981487,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.07684098185699,
"grad_norm": 0.16791198948334604,
"learning_rate": 6.353180975123594e-07,
"loss": 0.0,
"num_tokens": 77785589.0,
"reward": 0.73828125,
"reward_std": 0.06944026052951813,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 486
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.958618460012092e-10,
"advantages/std": 0.46759626269340515,
"advantages/var": 0.21864626488483996,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 2.08110992529349,
"grad_norm": 0.21844686995293106,
"learning_rate": 6.340306563337141e-07,
"loss": -0.0,
"num_tokens": 77934274.0,
"reward": 0.63671875,
"reward_std": 0.10941823571920395,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 487
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.845910451722217e-09,
"advantages/std": 0.572686493396759,
"advantages/var": 0.3279698197190761,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 2.0853788687299892,
"grad_norm": 0.27969529596515874,
"learning_rate": 6.327422570174372e-07,
"loss": -0.0,
"num_tokens": 78088673.0,
"reward": 0.69140625,
"reward_std": 0.1632179617881775,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 488
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 8.20278608731468e-09,
"advantages/std": 0.5960710644721985,
"advantages/var": 0.3553007139010198,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.089647812166489,
"grad_norm": 0.25314858952478914,
"learning_rate": 6.314529087738386e-07,
"loss": 0.0,
"num_tokens": 78252846.0,
"reward": 0.765625,
"reward_std": 0.17609265446662903,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 489
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 6.3376001869689765e-09,
"advantages/std": 0.6612836718559265,
"advantages/var": 0.4372960946632567,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.0939167556029883,
"grad_norm": 0.28461404439506616,
"learning_rate": 6.301626208200115e-07,
"loss": 0.0,
"num_tokens": 78407410.0,
"reward": 0.75390625,
"reward_std": 0.2205488383769989,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 490
},
{
"advantages/mean": 3.725290298461914e-09,
"advantages/snr": 7.511337010179756e-09,
"advantages/std": 0.4959556758403778,
"advantages/var": 0.2459720323982859,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.0981856990394876,
"grad_norm": 0.20324479481628369,
"learning_rate": 6.288714023797671e-07,
"loss": -0.0,
"num_tokens": 78560524.0,
"reward": 0.7578125,
"reward_std": 0.11705183982849121,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 491
},
{
"advantages/mean": 4.889443516731262e-09,
"advantages/snr": 8.917142764397378e-09,
"advantages/std": 0.5483195185661316,
"advantages/var": 0.3006542944405943,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 2.1024546424759873,
"grad_norm": 0.24222228283880398,
"learning_rate": 6.275792626835679e-07,
"loss": -0.0,
"num_tokens": 78715417.0,
"reward": 0.62109375,
"reward_std": 0.16477571427822113,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 492
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5483072400093079,
"advantages/var": 0.30064082944662474,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.1067235859124867,
"grad_norm": 0.2696121215659343,
"learning_rate": 6.262862109684625e-07,
"loss": -0.0,
"num_tokens": 78864264.0,
"reward": 0.6953125,
"reward_std": 0.15110859274864197,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 493
},
{
"advantages/mean": 5.122274160385132e-09,
"advantages/snr": 9.79779051410865e-09,
"advantages/std": 0.5227988958358765,
"advantages/var": 0.2733186854872116,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.110992529348986,
"grad_norm": 0.22331423821502822,
"learning_rate": 6.249922564780192e-07,
"loss": -0.0,
"num_tokens": 79021568.0,
"reward": 0.71875,
"reward_std": 0.14689236879348755,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 494
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755670987579601e-09,
"advantages/std": 0.49595534801483154,
"advantages/var": 0.24597170722451267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.1152614727854857,
"grad_norm": 0.2496596017738257,
"learning_rate": 6.236974084622597e-07,
"loss": 0.0,
"num_tokens": 79173348.0,
"reward": 0.67578125,
"reward_std": 0.11652141809463501,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 495
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 7.452419155772434e-09,
"advantages/std": 0.43739205598831177,
"advantages/var": 0.19131181064168246,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.119530416221985,
"grad_norm": 0.12826516345732708,
"learning_rate": 6.224016761775932e-07,
"loss": 0.0,
"num_tokens": 79326972.0,
"reward": 0.76953125,
"reward_std": 0.09153735637664795,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 496
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 9.388838395442294e-10,
"advantages/std": 0.49597325921058655,
"advantages/var": 0.24598947385197167,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.1237993596584843,
"grad_norm": 0.2844272076231935,
"learning_rate": 6.211050688867503e-07,
"loss": -0.0,
"num_tokens": 79483123.0,
"reward": 0.73828125,
"reward_std": 0.13466677069664001,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 497
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 4.2250808180061485e-09,
"advantages/std": 0.3306407332420349,
"advantages/var": 0.10932329447883049,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 2.128068303094984,
"grad_norm": 0.15637960274214446,
"learning_rate": 6.198075958587167e-07,
"loss": 0.0,
"num_tokens": 79636927.0,
"reward": 0.74609375,
"reward_std": 0.05497432500123978,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 498
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.9060327590462414e-10,
"advantages/std": 0.5960795879364014,
"advantages/var": 0.35531087515443005,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 2.1323372465314834,
"grad_norm": 0.3053836619718508,
"learning_rate": 6.18509266368667e-07,
"loss": 0.0,
"num_tokens": 79798753.0,
"reward": 0.66796875,
"reward_std": 0.18687278032302856,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 499
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.9876135291586783e-09,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.136606189967983,
"grad_norm": 0.2533729605115467,
"learning_rate": 6.172100896978985e-07,
"loss": 0.0,
"num_tokens": 79943968.0,
"reward": 0.73828125,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 500
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1231767740615974e-09,
"advantages/std": 0.5483072400093079,
"advantages/var": 0.30064082944662474,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.1408751334044824,
"grad_norm": 0.33647486360513806,
"learning_rate": 6.159100751337641e-07,
"loss": -0.0,
"num_tokens": 80088298.0,
"reward": 0.7890625,
"reward_std": 0.15110856294631958,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 501
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 4.481332608054646e-09,
"advantages/std": 0.4676010310649872,
"advantages/var": 0.2186507242530391,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.1451440768409817,
"grad_norm": 0.28214034878312183,
"learning_rate": 6.146092319696072e-07,
"loss": -0.0,
"num_tokens": 80234565.0,
"reward": 0.6953125,
"reward_std": 0.11230766773223877,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 502
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.489668896152697e-09,
"advantages/std": 0.46759358048439026,
"advantages/var": 0.21864375651021195,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 2.1494130202774815,
"grad_norm": 0.23343503528779358,
"learning_rate": 6.133075695046943e-07,
"loss": 0.0,
"num_tokens": 80368916.0,
"reward": 0.78515625,
"reward_std": 0.1060032919049263,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 503
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 4.140424438653788e-09,
"advantages/std": 0.6185687184333801,
"advantages/var": 0.3826272594243143,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.153681963713981,
"grad_norm": 0.2595846374198061,
"learning_rate": 6.120050970441485e-07,
"loss": -0.0,
"num_tokens": 80525671.0,
"reward": 0.67578125,
"reward_std": 0.18649208545684814,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 504
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.749637419262528e-10,
"advantages/std": 0.4049483835697174,
"advantages/var": 0.16398319335572697,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.15795090715048,
"grad_norm": 0.19438375634725735,
"learning_rate": 6.107018238988837e-07,
"loss": -0.0,
"num_tokens": 80670529.0,
"reward": 0.74609375,
"reward_std": 0.07995839416980743,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 505
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.6615187883262115e-09,
"advantages/std": 0.43740183115005493,
"advantages/var": 0.19132036189342116,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.16221985058698,
"grad_norm": 0.22311545662022994,
"learning_rate": 6.093977593855375e-07,
"loss": 0.0,
"num_tokens": 80819363.0,
"reward": 0.7578125,
"reward_std": 0.10007961094379425,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 506
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.794238057725804e-09,
"advantages/std": 0.5483013987541199,
"advantages/var": 0.30063442387572437,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.166488794023479,
"grad_norm": 0.22187008401811345,
"learning_rate": 6.080929128264045e-07,
"loss": 0.0,
"num_tokens": 80982643.0,
"reward": 0.7265625,
"reward_std": 0.14545084536075592,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 507
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.962789174963654e-09,
"advantages/std": 0.46759456396102905,
"advantages/var": 0.2186446762459049,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.1707577374599785,
"grad_norm": 0.2494787558010566,
"learning_rate": 6.067872935493702e-07,
"loss": 0.0,
"num_tokens": 81118448.0,
"reward": 0.80078125,
"reward_std": 0.10718034207820892,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 508
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.562928780215032e-09,
"advantages/std": 0.5227848291397095,
"advantages/var": 0.2733039775786352,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.175026680896478,
"grad_norm": 0.28498267319507764,
"learning_rate": 6.054809108878437e-07,
"loss": 0.0,
"num_tokens": 81263747.0,
"reward": 0.75,
"reward_std": 0.13151778280735016,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 509
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.919888970950097e-09,
"advantages/std": 0.43740561604499817,
"advantages/var": 0.19132367294770436,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.1792956243329775,
"grad_norm": 0.21060269613003632,
"learning_rate": 6.041737741806913e-07,
"loss": 0.0,
"num_tokens": 81408764.0,
"reward": 0.78125,
"reward_std": 0.10296659171581268,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 510
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.816804076067719e-09,
"advantages/std": 0.33063092827796936,
"advantages/var": 0.10931681073395172,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.1835645677694773,
"grad_norm": 0.14004657643654717,
"learning_rate": 6.028658927721697e-07,
"loss": -0.0,
"num_tokens": 81541398.0,
"reward": 0.80078125,
"reward_std": 0.04761157184839249,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 511
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196592786189577e-09,
"advantages/std": 0.5726943016052246,
"advantages/var": 0.32797876309109597,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 2.1878335112059766,
"grad_norm": 0.3074099145428851,
"learning_rate": 6.015572760118596e-07,
"loss": 0.0,
"num_tokens": 81702611.0,
"reward": 0.67578125,
"reward_std": 0.17399565875530243,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 512
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.877807622595892e-09,
"advantages/std": 0.4959627091884613,
"advantages/var": 0.24597900890555824,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.192102454642476,
"grad_norm": 0.2443639003613943,
"learning_rate": 6.002479332545981e-07,
"loss": 0.0,
"num_tokens": 81850958.0,
"reward": 0.70703125,
"reward_std": 0.12335620820522308,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 513
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.8168092671155283e-09,
"advantages/std": 0.4959454834461212,
"advantages/var": 0.2459619225506069,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 2.1963713980789756,
"grad_norm": 0.18910867071039786,
"learning_rate": 5.989378738604121e-07,
"loss": 0.0,
"num_tokens": 82004261.0,
"reward": 0.73828125,
"reward_std": 0.10627168416976929,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 514
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.45006423038189e-09,
"advantages/std": 0.4959668219089508,
"advantages/var": 0.24598308843446492,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.200640341515475,
"grad_norm": 0.19823134761784342,
"learning_rate": 5.976271071944516e-07,
"loss": 0.0,
"num_tokens": 82159605.0,
"reward": 0.69140625,
"reward_std": 0.12900903820991516,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 515
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.481341176536925e-09,
"advantages/std": 0.46760013699531555,
"advantages/var": 0.21864988811803787,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.2049092849519742,
"grad_norm": 0.21387409554665499,
"learning_rate": 5.963156426269227e-07,
"loss": -0.0,
"num_tokens": 82300202.0,
"reward": 0.796875,
"reward_std": 0.11283563077449799,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 516
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.6721843985338186e-09,
"advantages/std": 0.5227872133255005,
"advantages/var": 0.27330647041664236,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.209178228388474,
"grad_norm": 0.2458970836003116,
"learning_rate": 5.950034895330204e-07,
"loss": -0.0,
"num_tokens": 82450773.0,
"reward": 0.7578125,
"reward_std": 0.1349327266216278,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 517
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.492866815227978e-10,
"advantages/std": 0.5482969284057617,
"advantages/var": 0.300629521699193,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.2134471718249733,
"grad_norm": 0.2448989545301728,
"learning_rate": 5.936906572928624e-07,
"loss": -0.0,
"num_tokens": 82591859.0,
"reward": 0.71875,
"reward_std": 0.14032597839832306,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 518
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5960641503334045,
"advantages/var": 0.3552924713126835,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.2177161152614726,
"grad_norm": 0.3312415074798024,
"learning_rate": 5.923771552914201e-07,
"loss": -0.0,
"num_tokens": 82763398.0,
"reward": 0.5546875,
"reward_std": 0.16808080673217773,
"rewards/drgrpo_math_reward/mean": 0.5546875,
"rewards/drgrpo_math_reward/std": 0.49797385931015015,
"step": 519
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.472177112287613e-09,
"advantages/std": 0.5726823806762695,
"advantages/var": 0.3279651091370397,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.2219850586979724,
"grad_norm": 0.2937235840964889,
"learning_rate": 5.91062992918454e-07,
"loss": 0.0,
"num_tokens": 82914373.0,
"reward": 0.74609375,
"reward_std": 0.15820932388305664,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 520
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.572451479362036e-09,
"advantages/std": 0.4959532916545868,
"advantages/var": 0.24596966750301963,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.2262540021344717,
"grad_norm": 0.21928040534652227,
"learning_rate": 5.897481795684446e-07,
"loss": -0.0,
"num_tokens": 83078131.0,
"reward": 0.703125,
"reward_std": 0.115341916680336,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 521
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.2524805971912827e-09,
"advantages/std": 0.5726844668388367,
"advantages/var": 0.3279674985584826,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.2305229455709714,
"grad_norm": 0.284497900137537,
"learning_rate": 5.884327246405262e-07,
"loss": -0.0,
"num_tokens": 83237505.0,
"reward": 0.65234375,
"reward_std": 0.15991924703121185,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 522
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755668505089878e-09,
"advantages/std": 0.4959556758403778,
"advantages/var": 0.2459720323982859,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.2347918890074707,
"grad_norm": 0.25720163427951526,
"learning_rate": 5.8711663753842e-07,
"loss": 0.0,
"num_tokens": 83375158.0,
"reward": 0.8125,
"reward_std": 0.11705183982849121,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"step": 523
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.225095585998201e-09,
"advantages/std": 0.4959593713283539,
"advantages/var": 0.245975698008416,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.23906083244397,
"grad_norm": 0.25599797050896705,
"learning_rate": 5.857999276703657e-07,
"loss": 0.0,
"num_tokens": 83542285.0,
"reward": 0.64453125,
"reward_std": 0.12046922743320465,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 524
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.252482966806137e-09,
"advantages/std": 0.5726840496063232,
"advantages/var": 0.3279670206734977,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.24332977588047,
"grad_norm": 0.29768190456929716,
"learning_rate": 5.84482604449055e-07,
"loss": 0.0,
"num_tokens": 83686746.0,
"reward": 0.75,
"reward_std": 0.16097761690616608,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 525
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.4083492500432603e-09,
"advantages/std": 0.6612866520881653,
"advantages/var": 0.43730003622997415,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.247598719316969,
"grad_norm": 0.32444568808383006,
"learning_rate": 5.83164677291565e-07,
"loss": 0.0,
"num_tokens": 83839387.0,
"reward": 0.67578125,
"reward_std": 0.22449666261672974,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 526
},
{
"advantages/mean": 4.6566128730773926e-09,
"advantages/snr": 9.958571491388554e-09,
"advantages/std": 0.46759846806526184,
"advantages/var": 0.2186483273369797,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 2.2518676627534684,
"grad_norm": 0.20529368451226218,
"learning_rate": 5.818461556192892e-07,
"loss": -0.0,
"num_tokens": 84014173.0,
"reward": 0.578125,
"reward_std": 0.11059774458408356,
"rewards/drgrpo_math_reward/mean": 0.578125,
"rewards/drgrpo_math_reward/std": 0.49482619762420654,
"step": 527
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5482991337776184,
"advantages/var": 0.3006319401012867,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.256136606189968,
"grad_norm": 0.26826764902683475,
"learning_rate": 5.805270488578714e-07,
"loss": -0.0,
"num_tokens": 84163860.0,
"reward": 0.7578125,
"reward_std": 0.14203590154647827,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 528
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 9.958740326614902e-10,
"advantages/std": 0.4675905406475067,
"advantages/var": 0.21864091370302763,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.2604055496264674,
"grad_norm": 0.28990349934920284,
"learning_rate": 5.792073664371383e-07,
"loss": 0.0,
"num_tokens": 84307791.0,
"reward": 0.6953125,
"reward_std": 0.10205793380737305,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 529
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.123717230213227e-09,
"advantages/std": 0.6816261410713196,
"advantages/var": 0.46461419619177846,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.2646744930629668,
"grad_norm": 0.6035433531569014,
"learning_rate": 5.778871177910315e-07,
"loss": -0.0,
"num_tokens": 84478520.0,
"reward": 0.70703125,
"reward_std": 0.22081723809242249,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 530
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.979264165711202e-10,
"advantages/std": 0.4676004946231842,
"advantages/var": 0.21865022257184652,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.2689434364994665,
"grad_norm": 0.23720868638509054,
"learning_rate": 5.7656631235754e-07,
"loss": 0.0,
"num_tokens": 84623133.0,
"reward": 0.71484375,
"reward_std": 0.113366037607193,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 531
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.323178301971e-10,
"advantages/std": 0.4373902678489685,
"advantages/var": 0.1913102464089924,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.273212379935966,
"grad_norm": 0.1800617723214338,
"learning_rate": 5.752449595786341e-07,
"loss": 0.0,
"num_tokens": 84771437.0,
"reward": 0.71484375,
"reward_std": 0.08929946273565292,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 532
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.43739479780197144,
"advantages/var": 0.19131420914422748,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.2774813233724656,
"grad_norm": 0.24425106303274358,
"learning_rate": 5.739230689001955e-07,
"loss": 0.0,
"num_tokens": 84929213.0,
"reward": 0.69921875,
"reward_std": 0.09324727952480316,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 533
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.6262301431742259e-09,
"advantages/std": 0.5726880431175232,
"advantages/var": 0.3279715947297781,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 2.281750266808965,
"grad_norm": 0.26506567231170286,
"learning_rate": 5.726006497719524e-07,
"loss": -0.0,
"num_tokens": 85093340.0,
"reward": 0.62890625,
"reward_std": 0.16557206213474274,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 534
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.979420942197835e-10,
"advantages/std": 0.4675857722759247,
"advantages/var": 0.2186364544348729,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 2.286019210245464,
"grad_norm": 0.20899718020336794,
"learning_rate": 5.712777116474102e-07,
"loss": 0.0,
"num_tokens": 85244179.0,
"reward": 0.73828125,
"reward_std": 0.09916850179433823,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 535
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 8.049523192160078e-09,
"advantages/std": 0.40494683384895325,
"advantages/var": 0.16398193824429175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.2902881536819635,
"grad_norm": 0.19129106216091937,
"learning_rate": 5.699542639837843e-07,
"loss": 0.0,
"num_tokens": 85388966.0,
"reward": 0.8125,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"step": 536
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5483127236366272,
"advantages/var": 0.3006468429018163,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.2945570971184632,
"grad_norm": 0.25350005366655237,
"learning_rate": 5.686303162419324e-07,
"loss": 0.0,
"num_tokens": 85550800.0,
"reward": 0.67578125,
"reward_std": 0.15623590350151062,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 537
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 5.164009439899339e-09,
"advantages/std": 0.49595901370048523,
"advantages/var": 0.2459753432707581,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 2.2988260405549625,
"grad_norm": 0.2590780841185054,
"learning_rate": 5.673058778862878e-07,
"loss": -0.0,
"num_tokens": 85718806.0,
"reward": 0.6484375,
"reward_std": 0.11993881314992905,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 538
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 5.1746161723483324e-09,
"advantages/std": 0.40495288372039795,
"advantages/var": 0.16398683803346614,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.3030949839914623,
"grad_norm": 0.16751371969958892,
"learning_rate": 5.659809583847907e-07,
"loss": 0.0,
"num_tokens": 85878807.0,
"reward": 0.640625,
"reward_std": 0.08337579667568207,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 539
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.1291852300839092e-09,
"advantages/std": 0.6185804605484009,
"advantages/var": 0.38264178617227174,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.3073639274279616,
"grad_norm": 0.29921396743821344,
"learning_rate": 5.646555672088202e-07,
"loss": 0.0,
"num_tokens": 86036114.0,
"reward": 0.67578125,
"reward_std": 0.20345547795295715,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 540
},
{
"advantages/mean": -5.587935447692871e-09,
"advantages/snr": 1.0688468277682715e-08,
"advantages/std": 0.5228003859519958,
"advantages/var": 0.2733202435515558,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 2.311632870864461,
"grad_norm": 0.18961370483557366,
"learning_rate": 5.633297138331284e-07,
"loss": 0.0,
"num_tokens": 86212591.0,
"reward": 0.7109375,
"reward_std": 0.14913025498390198,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 541
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 5.323123171578266e-09,
"advantages/std": 0.43739479780197144,
"advantages/var": 0.19131420914422748,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.3159018143009606,
"grad_norm": 0.21369706487245865,
"learning_rate": 5.620034077357707e-07,
"loss": 0.0,
"num_tokens": 86356705.0,
"reward": 0.80859375,
"reward_std": 0.09324727952480316,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 542
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.473288895977308e-09,
"advantages/std": 0.4675827622413635,
"advantages/var": 0.2186336395452635,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.32017075773746,
"grad_norm": 0.25034432748651897,
"learning_rate": 5.606766583980389e-07,
"loss": 0.0,
"num_tokens": 86514364.0,
"reward": 0.6640625,
"reward_std": 0.09522314369678497,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 543
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.562896688905427e-09,
"advantages/std": 0.5227895379066467,
"advantages/var": 0.2733089009446452,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.3244397011739593,
"grad_norm": 0.2590621429358719,
"learning_rate": 5.593494753043937e-07,
"loss": 0.0,
"num_tokens": 86681566.0,
"reward": 0.6171875,
"reward_std": 0.13664263486862183,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 544
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.225151187992327e-09,
"advantages/std": 0.495952844619751,
"advantages/var": 0.24596922408642286,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.328708644610459,
"grad_norm": 0.2321484076747434,
"learning_rate": 5.580218679423964e-07,
"loss": -0.0,
"num_tokens": 86816847.0,
"reward": 0.79296875,
"reward_std": 0.11310647428035736,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 545
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.958639405622186e-10,
"advantages/std": 0.46759527921676636,
"advantages/var": 0.2186453451458057,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 2.3329775880469583,
"grad_norm": 0.24188330925260715,
"learning_rate": 5.56693845802641e-07,
"loss": -0.0,
"num_tokens": 86987597.0,
"reward": 0.69140625,
"reward_std": 0.10824117809534073,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 546
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 4.979341600636731e-09,
"advantages/std": 0.4675932228565216,
"advantages/var": 0.21864342206134868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.3372465314834576,
"grad_norm": 0.17450798270363213,
"learning_rate": 5.553654183786871e-07,
"loss": -0.0,
"num_tokens": 87155677.0,
"reward": 0.671875,
"reward_std": 0.1054728701710701,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 547
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 6.298381478091215e-10,
"advantages/std": 0.3696674108505249,
"advantages/var": 0.13665399464493078,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.3415154749199574,
"grad_norm": 0.16176405546533756,
"learning_rate": 5.540365951669912e-07,
"loss": -0.0,
"num_tokens": 87298798.0,
"reward": 0.734375,
"reward_std": 0.0677327960729599,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 548
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.9061065803549066e-10,
"advantages/std": 0.5960683226585388,
"advantages/var": 0.35529744527696394,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 2.3457844183564567,
"grad_norm": 0.2665223749786248,
"learning_rate": 5.527073856668391e-07,
"loss": -0.0,
"num_tokens": 87478889.0,
"reward": 0.6328125,
"reward_std": 0.1732056736946106,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 549
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 2.1293189085785e-09,
"advantages/std": 0.43738049268722534,
"advantages/var": 0.19130169538331998,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.3500533617929564,
"grad_norm": 0.2916496247007535,
"learning_rate": 5.51377799380278e-07,
"loss": -0.0,
"num_tokens": 87626632.0,
"reward": 0.65625,
"reward_std": 0.08075720071792603,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 550
},
{
"advantages/mean": 3.259629011154175e-09,
"advantages/snr": 5.468468828956508e-09,
"advantages/std": 0.5960770845413208,
"advantages/var": 0.3553078907152809,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 2.3543223052294557,
"grad_norm": 0.2702531847026088,
"learning_rate": 5.500478458120493e-07,
"loss": -0.0,
"num_tokens": 87803587.0,
"reward": 0.640625,
"reward_std": 0.18451623618602753,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 551
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.235199298796701e-09,
"advantages/std": 0.5227786302566528,
"advantages/var": 0.27329749625302213,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.358591248665955,
"grad_norm": 0.2505033849756049,
"learning_rate": 5.487175344695187e-07,
"loss": -0.0,
"num_tokens": 87974455.0,
"reward": 0.75,
"reward_std": 0.12415501475334167,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 552
},
{
"advantages/mean": -7.2177499532699585e-09,
"advantages/snr": 1.0589000793862824e-08,
"advantages/std": 0.681627094745636,
"advantages/var": 0.4646154962913762,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 2.362860192102455,
"grad_norm": 0.35984344725559825,
"learning_rate": 5.473868748626109e-07,
"loss": 0.0,
"num_tokens": 88129378.0,
"reward": 0.7734375,
"reward_std": 0.2225247025489807,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 553
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.450106882338915e-09,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.367129135538954,
"grad_norm": 0.25738104886057167,
"learning_rate": 5.460558765037392e-07,
"loss": 0.0,
"num_tokens": 88277805.0,
"reward": 0.69140625,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 554
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.09823736693788e-09,
"advantages/std": 0.5726998448371887,
"advantages/var": 0.32798511227654004,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.3713980789754534,
"grad_norm": 0.3490442107506381,
"learning_rate": 5.447245489077388e-07,
"loss": 0.0,
"num_tokens": 88433920.0,
"reward": 0.73046875,
"reward_std": 0.17965340614318848,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 555
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.789750026296212e-09,
"advantages/std": 0.5227856636047363,
"advantages/var": 0.27330485007064453,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.375667022411953,
"grad_norm": 0.28529706683678874,
"learning_rate": 5.433929015917988e-07,
"loss": -0.0,
"num_tokens": 88596517.0,
"reward": 0.65625,
"reward_std": 0.13269482553005219,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 556
},
{
"advantages/mean": -6.05359673500061e-09,
"advantages/snr": 1.1040569632820925e-08,
"advantages/std": 0.5483047366142273,
"advantages/var": 0.30063808419359717,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.3799359658484525,
"grad_norm": 0.2523351201696841,
"learning_rate": 5.420609440753935e-07,
"loss": 0.0,
"num_tokens": 88737878.0,
"reward": 0.78515625,
"reward_std": 0.14886823296546936,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 557
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.008234554568749e-09,
"advantages/std": 0.5227926969528198,
"advantages/var": 0.2733122039872029,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.3842049092849518,
"grad_norm": 0.28765703107991825,
"learning_rate": 5.407286858802147e-07,
"loss": 0.0,
"num_tokens": 88887978.0,
"reward": 0.7265625,
"reward_std": 0.13952961564064026,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 558
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.3969473161275114e-09,
"advantages/std": 0.5483291149139404,
"advantages/var": 0.3006648182623053,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.3884738527214515,
"grad_norm": 0.2284492587672275,
"learning_rate": 5.393961365301041e-07,
"loss": -0.0,
"num_tokens": 89058883.0,
"reward": 0.62109375,
"reward_std": 0.1777912974357605,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 559
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.273916450676268e-09,
"advantages/std": 0.5483027696609497,
"advantages/var": 0.30063592721786847,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.392742796157951,
"grad_norm": 0.3098345163048042,
"learning_rate": 5.380633055509842e-07,
"loss": 0.0,
"num_tokens": 89222944.0,
"reward": 0.6640625,
"reward_std": 0.1459837108850479,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 560
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.763938073065637e-09,
"advantages/std": 0.618582546710968,
"advantages/var": 0.38264436709542693,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.3970117395944506,
"grad_norm": 0.29717892647935373,
"learning_rate": 5.36730202470791e-07,
"loss": 0.0,
"num_tokens": 89380376.0,
"reward": 0.67578125,
"reward_std": 0.20357662439346313,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 561
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.899073991974699e-09,
"advantages/std": 0.5227798223495483,
"advantages/var": 0.2732987426558253,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 2.40128068303095,
"grad_norm": 0.25529005934544713,
"learning_rate": 5.35396836819406e-07,
"loss": 0.0,
"num_tokens": 89535948.0,
"reward": 0.69921875,
"reward_std": 0.1258624941110611,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 562
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2739268370849381e-09,
"advantages/std": 0.5482982993125916,
"advantages/var": 0.30063102502908023,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.405549626467449,
"grad_norm": 0.3079728383819247,
"learning_rate": 5.340632181285871e-07,
"loss": 0.0,
"num_tokens": 89695747.0,
"reward": 0.7265625,
"reward_std": 0.14085885882377625,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 563
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.453712769274014e-10,
"advantages/std": 0.5227787494659424,
"advantages/var": 0.27329762089317455,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 2.409818569903949,
"grad_norm": 0.28409215670000054,
"learning_rate": 5.327293559319013e-07,
"loss": 0.0,
"num_tokens": 89842574.0,
"reward": 0.796875,
"reward_std": 0.1258600354194641,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 564
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196682913687935e-09,
"advantages/std": 0.5726900696754456,
"advantages/var": 0.3279739159048667,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.4140875133404482,
"grad_norm": 0.3257234308296981,
"learning_rate": 5.313952597646567e-07,
"loss": 0.0,
"num_tokens": 90003355.0,
"reward": 0.63671875,
"reward_std": 0.1655769646167755,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 565
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.5201672004113415e-09,
"advantages/std": 0.5483164191246033,
"advantages/var": 0.3006508954816276,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.4183564567769475,
"grad_norm": 0.19915846849528268,
"learning_rate": 5.300609391638335e-07,
"loss": 0.0,
"num_tokens": 90168401.0,
"reward": 0.75390625,
"reward_std": 0.1618887335062027,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 566
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.877808186781896e-09,
"advantages/std": 0.49596256017684937,
"advantages/var": 0.24597886109717493,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 2.4226254002134473,
"grad_norm": 0.23995835912984506,
"learning_rate": 5.287264036680165e-07,
"loss": -0.0,
"num_tokens": 90325006.0,
"reward": 0.74609375,
"reward_std": 0.1249450072646141,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 567
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.439358924571688e-09,
"advantages/std": 0.5726848244667053,
"advantages/var": 0.3279679081744611,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.4268943436499466,
"grad_norm": 0.26202942056692013,
"learning_rate": 5.273916628173269e-07,
"loss": -0.0,
"num_tokens": 90488837.0,
"reward": 0.6953125,
"reward_std": 0.16044965386390686,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 568
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.7248786584154183e-09,
"advantages/std": 0.4049513339996338,
"advantages/var": 0.16398558290808296,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.431163287086446,
"grad_norm": 0.21189495244336484,
"learning_rate": 5.260567261533537e-07,
"loss": 0.0,
"num_tokens": 90639437.0,
"reward": 0.77734375,
"reward_std": 0.08166831731796265,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 569
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.098269735979208e-09,
"advantages/std": 0.5726968050003052,
"advantages/var": 0.32798163045755757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.4354322305229457,
"grad_norm": 0.25179541657971016,
"learning_rate": 5.247216032190853e-07,
"loss": 0.0,
"num_tokens": 90787086.0,
"reward": 0.765625,
"reward_std": 0.17464721202850342,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 570
},
{
"advantages/mean": -5.122274160385132e-09,
"advantages/snr": 9.797886581565322e-09,
"advantages/std": 0.5227937698364258,
"advantages/var": 0.27331332577978174,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.439701173959445,
"grad_norm": 0.2519623444961031,
"learning_rate": 5.233863035588426e-07,
"loss": 0.0,
"num_tokens": 90931241.0,
"reward": 0.79296875,
"reward_std": 0.13953207433223724,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 571
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.599778655449369e-09,
"advantages/std": 0.4049423336982727,
"advantages/var": 0.16397829362100325,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.4439701173959447,
"grad_norm": 0.18670377125438592,
"learning_rate": 5.220508367182089e-07,
"loss": 0.0,
"num_tokens": 91081000.0,
"reward": 0.79296875,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 572
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.49595820903778076,
"advantages/var": 0.24597454511196304,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.448239060832444,
"grad_norm": 0.22455891352376486,
"learning_rate": 5.207152122439635e-07,
"loss": 0.0,
"num_tokens": 91240089.0,
"reward": 0.671875,
"reward_std": 0.12046677619218826,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 573
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.7896780752123385e-09,
"advantages/std": 0.5227921605110168,
"advantages/var": 0.2733116430917768,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.4525080042689433,
"grad_norm": 0.26883169817327435,
"learning_rate": 5.193794396840116e-07,
"loss": 0.0,
"num_tokens": 91384725.0,
"reward": 0.83984375,
"reward_std": 0.13888297975063324,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"step": 574
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.794176017006791e-09,
"advantages/std": 0.548306405544281,
"advantages/var": 0.30063991436088955,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.456776947705443,
"grad_norm": 0.23086236527150877,
"learning_rate": 5.180435285873181e-07,
"loss": -0.0,
"num_tokens": 91543106.0,
"reward": 0.6875,
"reward_std": 0.14993153512477875,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 575
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.547886080441306e-09,
"advantages/std": 0.5482913255691528,
"advantages/var": 0.30062337769437875,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.4610458911419424,
"grad_norm": 0.22269172564936424,
"learning_rate": 5.167074885038372e-07,
"loss": 0.0,
"num_tokens": 91704890.0,
"reward": 0.68359375,
"reward_std": 0.13349363207817078,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 576
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.3277959731395836e-09,
"advantages/std": 0.701404869556427,
"advantages/var": 0.4919687910374684,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.4653148345784417,
"grad_norm": 0.2822230840932937,
"learning_rate": 5.153713289844461e-07,
"loss": 0.0,
"num_tokens": 91862016.0,
"reward": 0.68359375,
"reward_std": 0.2585534155368805,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 577
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.168819230075082e-09,
"advantages/std": 0.6612796783447266,
"advantages/var": 0.437290812991705,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.4695837780149414,
"grad_norm": 0.28122934660841753,
"learning_rate": 5.14035059580875e-07,
"loss": 0.0,
"num_tokens": 92026532.0,
"reward": 0.703125,
"reward_std": 0.21489356458187103,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 578
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.2596836075630617e-09,
"advantages/std": 0.369665265083313,
"advantages/var": 0.13665240820911606,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.4738527214514408,
"grad_norm": 0.1976911781209467,
"learning_rate": 5.1269868984564e-07,
"loss": -0.0,
"num_tokens": 92181793.0,
"reward": 0.65625,
"reward_std": 0.06549490243196487,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 579
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.781509278854418e-09,
"advantages/std": 0.5227716565132141,
"advantages/var": 0.2732902048535699,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 2.47812166488794,
"grad_norm": 0.28255159007796576,
"learning_rate": 5.113622293319749e-07,
"loss": -0.0,
"num_tokens": 92335607.0,
"reward": 0.6640625,
"reward_std": 0.1173202246427536,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 580
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 3.1491018816281693e-09,
"advantages/std": 0.3696778416633606,
"advantages/var": 0.1366617066168807,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.48239060832444,
"grad_norm": 0.22949898840587127,
"learning_rate": 5.100256875937613e-07,
"loss": -0.0,
"num_tokens": 92491950.0,
"reward": 0.68359375,
"reward_std": 0.07680301368236542,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 581
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5483025908470154,
"advantages/var": 0.30063573112954955,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.486659551760939,
"grad_norm": 0.2499851224101172,
"learning_rate": 5.086890741854626e-07,
"loss": -0.0,
"num_tokens": 92640587.0,
"reward": 0.78125,
"reward_std": 0.14757250249385834,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 582
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.6985493589245975e-09,
"advantages/std": 0.5483046770095825,
"advantages/var": 0.3006380188305826,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.490928495197439,
"grad_norm": 0.2736270277798583,
"learning_rate": 5.073523986620538e-07,
"loss": -0.0,
"num_tokens": 92790601.0,
"reward": 0.75390625,
"reward_std": 0.14716321229934692,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 583
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.3361071279842629e-09,
"advantages/std": 0.5227813720703125,
"advantages/var": 0.2733003629837185,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.495197438633938,
"grad_norm": 0.21115201968945138,
"learning_rate": 5.060156705789544e-07,
"loss": -0.0,
"num_tokens": 92958566.0,
"reward": 0.67578125,
"reward_std": 0.12810036540031433,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 584
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.065611325146664e-10,
"advantages/std": 0.5726829767227173,
"advantages/var": 0.32796579182799235,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 2.4994663820704375,
"grad_norm": 0.22795342467727167,
"learning_rate": 5.046788994919594e-07,
"loss": -0.0,
"num_tokens": 93128784.0,
"reward": 0.68359375,
"reward_std": 0.15927013754844666,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 585
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 5.8182291194331764e-09,
"advantages/std": 0.6402790546417236,
"advantages/var": 0.4099572678128993,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.503735325506937,
"grad_norm": 0.29052637275346876,
"learning_rate": 5.033420949571712e-07,
"loss": 0.0,
"num_tokens": 93290584.0,
"reward": 0.7109375,
"reward_std": 0.19818973541259766,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 586
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.249816775513951e-09,
"advantages/std": 0.5960639119148254,
"advantages/var": 0.3552921870872048,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.5080042689434365,
"grad_norm": 0.3227661999483521,
"learning_rate": 5.020052665309311e-07,
"loss": 0.0,
"num_tokens": 93444117.0,
"reward": 0.77734375,
"reward_std": 0.16755038499832153,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 587
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.077939062236641e-09,
"advantages/std": 0.596068263053894,
"advantages/var": 0.3552973742200862,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.512273212379936,
"grad_norm": 0.28291810883393154,
"learning_rate": 5.006684237697519e-07,
"loss": 0.0,
"num_tokens": 93600079.0,
"reward": 0.6875,
"reward_std": 0.1732056736946106,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 588
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.218654289967664e-09,
"advantages/std": 0.5483183860778809,
"advantages/var": 0.300653052511052,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.5165421558164356,
"grad_norm": 0.3046525980956177,
"learning_rate": 4.993315762302482e-07,
"loss": 0.0,
"num_tokens": 93756932.0,
"reward": 0.671875,
"reward_std": 0.1630682349205017,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 589
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.4938096208518726e-09,
"advantages/std": 0.46759098768234253,
"advantages/var": 0.2186413317617486,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.520811099252935,
"grad_norm": 0.23476904635933984,
"learning_rate": 4.979947334690689e-07,
"loss": -0.0,
"num_tokens": 93913252.0,
"reward": 0.64453125,
"reward_std": 0.10429336130619049,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 590
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.296726261184258e-09,
"advantages/std": 0.5960670709609985,
"advantages/var": 0.35529595308402406,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.525080042689434,
"grad_norm": 0.3037000991415936,
"learning_rate": 4.96657905042829e-07,
"loss": 0.0,
"num_tokens": 94069395.0,
"reward": 0.75,
"reward_std": 0.1726727932691574,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 591
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.492741255292345e-10,
"advantages/std": 0.5483050346374512,
"advantages/var": 0.30063841100877653,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 2.529348986125934,
"grad_norm": 0.221341914770484,
"learning_rate": 4.953211005080407e-07,
"loss": -0.0,
"num_tokens": 94243004.0,
"reward": 0.609375,
"reward_std": 0.14939865469932556,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 592
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.404949814081192,
"advantages/var": 0.16398435192439198,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 2.5336179295624333,
"grad_norm": 0.2662025714061716,
"learning_rate": 4.939843294210455e-07,
"loss": -0.0,
"num_tokens": 94405400.0,
"reward": 0.625,
"reward_std": 0.07996084541082382,
"rewards/drgrpo_math_reward/mean": 0.625,
"rewards/drgrpo_math_reward/std": 0.4850712716579437,
"step": 593
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1232210892462864e-09,
"advantages/std": 0.548295795917511,
"advantages/var": 0.30062827982081686,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.537886872998933,
"grad_norm": 0.27605722927884196,
"learning_rate": 4.926476013379462e-07,
"loss": 0.0,
"num_tokens": 94555071.0,
"reward": 0.66015625,
"reward_std": 0.13861851394176483,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 594
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.124909012720456e-09,
"advantages/std": 0.5960637927055359,
"advantages/var": 0.35529204497450806,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 2.5421558164354323,
"grad_norm": 0.2581301367381883,
"learning_rate": 4.913109258145374e-07,
"loss": 0.0,
"num_tokens": 94715089.0,
"reward": 0.66796875,
"reward_std": 0.16913917660713196,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 595
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.4896708003183657e-09,
"advantages/std": 0.4675932228565216,
"advantages/var": 0.21864342206134868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.5464247598719316,
"grad_norm": 0.20105910467908297,
"learning_rate": 4.899743124062388e-07,
"loss": 0.0,
"num_tokens": 94868432.0,
"reward": 0.734375,
"reward_std": 0.1054728776216507,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 596
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.322974107616147e-10,
"advantages/std": 0.4374070465564728,
"advantages/var": 0.19132492437725634,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.550693703308431,
"grad_norm": 0.22232398034984727,
"learning_rate": 4.886377706680252e-07,
"loss": -0.0,
"num_tokens": 95019364.0,
"reward": 0.72265625,
"reward_std": 0.1046740710735321,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 597
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.252490752707826e-09,
"advantages/std": 0.5726826786994934,
"advantages/var": 0.3279654504824272,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 2.5549626467449307,
"grad_norm": 0.24824824609667348,
"learning_rate": 4.873013101543599e-07,
"loss": 0.0,
"num_tokens": 95202253.0,
"reward": 0.53125,
"reward_std": 0.15873973071575165,
"rewards/drgrpo_math_reward/mean": 0.53125,
"rewards/drgrpo_math_reward/std": 0.5,
"step": 598
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.246425098394015e-09,
"advantages/std": 0.5482980012893677,
"advantages/var": 0.30063069821791544,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.55923159018143,
"grad_norm": 0.28294020874569886,
"learning_rate": 4.859649404191251e-07,
"loss": -0.0,
"num_tokens": 95365009.0,
"reward": 0.75390625,
"reward_std": 0.14032843708992004,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 599
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.008258775018605e-09,
"advantages/std": 0.5227895379066467,
"advantages/var": 0.2733089009446452,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.5635005336179297,
"grad_norm": 0.3325270821562332,
"learning_rate": 4.846286710155539e-07,
"loss": -0.0,
"num_tokens": 95507354.0,
"reward": 0.765625,
"reward_std": 0.13664263486862183,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 600
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.95873080567915e-10,
"advantages/std": 0.46759098768234253,
"advantages/var": 0.2186413317617486,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.567769477054429,
"grad_norm": 0.2593795240903901,
"learning_rate": 4.832925114961628e-07,
"loss": -0.0,
"num_tokens": 95676481.0,
"reward": 0.63671875,
"reward_std": 0.10429336875677109,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 601
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.882008025557002e-09,
"advantages/std": 0.6185697317123413,
"advantages/var": 0.3826285129906779,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 2.5720384204909283,
"grad_norm": 0.338763944385851,
"learning_rate": 4.819564714126818e-07,
"loss": 0.0,
"num_tokens": 95840225.0,
"reward": 0.6015625,
"reward_std": 0.18819957971572876,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"step": 602
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983547925259405e-09,
"advantages/std": 0.4675844609737396,
"advantages/var": 0.21863522814410263,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.576307363927428,
"grad_norm": 0.22735309130132134,
"learning_rate": 4.806205603159882e-07,
"loss": 0.0,
"num_tokens": 96008068.0,
"reward": 0.6875,
"reward_std": 0.0974610224366188,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 603
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.987569352881188e-09,
"advantages/std": 0.4675987958908081,
"advantages/var": 0.21864863391853362,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.5805763073639274,
"grad_norm": 0.23837085253396123,
"learning_rate": 4.792847877560366e-07,
"loss": -0.0,
"num_tokens": 96176960.0,
"reward": 0.55859375,
"reward_std": 0.11112815886735916,
"rewards/drgrpo_math_reward/mean": 0.55859375,
"rewards/drgrpo_math_reward/std": 0.4975275993347168,
"step": 604
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.882008932297716e-09,
"advantages/std": 0.6185694336891174,
"advantages/var": 0.38262814429447545,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.584845250800427,
"grad_norm": 0.3473234539247001,
"learning_rate": 4.779491632817911e-07,
"loss": 0.0,
"num_tokens": 96317343.0,
"reward": 0.73828125,
"reward_std": 0.18766915798187256,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 605
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.687352269762781e-09,
"advantages/std": 0.5960652232170105,
"advantages/var": 0.35529375032874455,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.5891141942369265,
"grad_norm": 0.29438078792503247,
"learning_rate": 4.766136964411575e-07,
"loss": 0.0,
"num_tokens": 96467809.0,
"reward": 0.69921875,
"reward_std": 0.16978827118873596,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 606
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.6722398485864743e-09,
"advantages/std": 0.5227763652801514,
"advantages/var": 0.27329512809552625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.5933831376734258,
"grad_norm": 0.2350516882093116,
"learning_rate": 4.752783967809146e-07,
"loss": 0.0,
"num_tokens": 96612952.0,
"reward": 0.765625,
"reward_std": 0.12244509160518646,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 607
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.991740829431179e-09,
"advantages/std": 0.4675922393798828,
"advantages/var": 0.21864250232829363,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.597652081109925,
"grad_norm": 0.2820015262201967,
"learning_rate": 4.7394327384664647e-07,
"loss": -0.0,
"num_tokens": 96769729.0,
"reward": 0.6484375,
"reward_std": 0.10429581999778748,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 608
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.408338946629123e-09,
"advantages/std": 0.33064574003219604,
"advantages/var": 0.10932660540143857,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 2.601921024546425,
"grad_norm": 0.15699078968341892,
"learning_rate": 4.7260833718267303e-07,
"loss": -0.0,
"num_tokens": 96905296.0,
"reward": 0.69921875,
"reward_std": 0.05786130577325821,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 609
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.2524545316557682e-09,
"advantages/std": 0.5726890563964844,
"advantages/var": 0.32797275531629566,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 2.606189967982924,
"grad_norm": 0.2880655388696752,
"learning_rate": 4.712735963319833e-07,
"loss": -0.0,
"num_tokens": 97084796.0,
"reward": 0.5703125,
"reward_std": 0.16557452082633972,
"rewards/drgrpo_math_reward/mean": 0.5703125,
"rewards/drgrpo_math_reward/std": 0.4960011839866638,
"step": 610
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.906074942309521e-09,
"advantages/std": 0.5960731506347656,
"advantages/var": 0.355303200907656,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 2.610458911419424,
"grad_norm": 0.33861166709668683,
"learning_rate": 4.699390608361665e-07,
"loss": 0.0,
"num_tokens": 97262848.0,
"reward": 0.6640625,
"reward_std": 0.17950759828090668,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 611
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.547755075542108e-09,
"advantages/std": 0.5483195185661316,
"advantages/var": 0.3006542944405943,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.614727854855923,
"grad_norm": 0.2364570063084319,
"learning_rate": 4.686047402353433e-07,
"loss": 0.0,
"num_tokens": 97422059.0,
"reward": 0.81640625,
"reward_std": 0.16477571427822113,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 612
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.511102309549375e-09,
"advantages/std": 0.4959711730480194,
"advantages/var": 0.24598740449462841,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.6189967982924225,
"grad_norm": 0.2514100284273653,
"learning_rate": 4.672706440680988e-07,
"loss": -0.0,
"num_tokens": 97570673.0,
"reward": 0.78125,
"reward_std": 0.133487269282341,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 613
},
{
"advantages/mean": 3.725290298461914e-09,
"advantages/snr": 9.1991450029823e-09,
"advantages/std": 0.4049604833126068,
"advantages/var": 0.1639929930447801,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 2.6232657417289222,
"grad_norm": 0.22105331477881468,
"learning_rate": 4.6593678187141296e-07,
"loss": -0.0,
"num_tokens": 97730199.0,
"reward": 0.57421875,
"reward_std": 0.09020812809467316,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 614
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.4083475927177635e-09,
"advantages/std": 0.4959655702114105,
"advantages/var": 0.24598184683512958,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.6275346851654215,
"grad_norm": 0.2662951442220497,
"learning_rate": 4.6460316318059394e-07,
"loss": -0.0,
"num_tokens": 97874707.0,
"reward": 0.7421875,
"reward_std": 0.12730157375335693,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 615
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983392291580966e-09,
"advantages/std": 0.4676027297973633,
"advantages/var": 0.21865231291394593,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.6318036286019213,
"grad_norm": 0.2441469783117011,
"learning_rate": 4.63269797529209e-07,
"loss": 0.0,
"num_tokens": 98028818.0,
"reward": 0.71875,
"reward_std": 0.1145455539226532,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 616
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.9916976690634064e-09,
"advantages/std": 0.46760237216949463,
"advantages/var": 0.21865197845853857,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.6360725720384206,
"grad_norm": 0.23198391252921965,
"learning_rate": 4.619366944490157e-07,
"loss": 0.0,
"num_tokens": 98180489.0,
"reward": 0.73828125,
"reward_std": 0.1140151396393776,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 617
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.9917366402546925e-09,
"advantages/std": 0.4675932228565216,
"advantages/var": 0.21864342206134868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.64034151547492,
"grad_norm": 0.2740599648998935,
"learning_rate": 4.60603863469896e-07,
"loss": 0.0,
"num_tokens": 98342280.0,
"reward": 0.734375,
"reward_std": 0.1054728776216507,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 618
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.032780062477822e-09,
"advantages/std": 0.5726901888847351,
"advantages/var": 0.3279740524448336,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.644610458911419,
"grad_norm": 0.29072671404773964,
"learning_rate": 4.592713141197853e-07,
"loss": -0.0,
"num_tokens": 98500354.0,
"reward": 0.75390625,
"reward_std": 0.16898700594902039,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 619
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.49255107598981e-10,
"advantages/std": 0.5483173131942749,
"advantages/var": 0.30065187594858855,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 2.648879402347919,
"grad_norm": 0.2904695592467313,
"learning_rate": 4.5793905592460655e-07,
"loss": 0.0,
"num_tokens": 98656811.0,
"reward": 0.78515625,
"reward_std": 0.16306579113006592,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 620
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.520222407296989e-09,
"advantages/std": 0.5483109354972839,
"advantages/var": 0.30064488198590666,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 2.6531483457844183,
"grad_norm": 0.2600164832503189,
"learning_rate": 4.566070984082013e-07,
"loss": 0.0,
"num_tokens": 98802760.0,
"reward": 0.78125,
"reward_std": 0.15676140785217285,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 621
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.408326774808376e-09,
"advantages/std": 0.4959729015827179,
"advantages/var": 0.24598911910438037,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.657417289220918,
"grad_norm": 0.21213210045500452,
"learning_rate": 4.5527545109226116e-07,
"loss": 0.0,
"num_tokens": 98945541.0,
"reward": 0.71875,
"reward_std": 0.134136363863945,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 622
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.336089152630887e-09,
"advantages/std": 0.522788405418396,
"advantages/var": 0.2733077168399092,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.6616862326574173,
"grad_norm": 0.24553847372710627,
"learning_rate": 4.5394412349626086e-07,
"loss": 0.0,
"num_tokens": 99089384.0,
"reward": 0.74609375,
"reward_std": 0.13664019107818604,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 623
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.898971776345557e-09,
"advantages/std": 0.5227907299995422,
"advantages/var": 0.27331014737345427,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.6659551760939166,
"grad_norm": 0.253230056323244,
"learning_rate": 4.5261312513738915e-07,
"loss": 0.0,
"num_tokens": 99238259.0,
"reward": 0.77734375,
"reward_std": 0.13835011422634125,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 624
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.547766430613214e-09,
"advantages/std": 0.5483170747756958,
"advantages/var": 0.300651614490576,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.6702241195304164,
"grad_norm": 0.28244036585218013,
"learning_rate": 4.5128246553048127e-07,
"loss": -0.0,
"num_tokens": 99411206.0,
"reward": 0.61328125,
"reward_std": 0.1629495769739151,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 625
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.3472868915768388e-09,
"advantages/std": 0.4959569275379181,
"advantages/var": 0.24597327397285174,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.6744930629669157,
"grad_norm": 0.2155500491493238,
"learning_rate": 4.499521541879508e-07,
"loss": 0.0,
"num_tokens": 99567544.0,
"reward": 0.68359375,
"reward_std": 0.11875930428504944,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 626
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.755631042461607e-09,
"advantages/std": 0.49596062302589417,
"advantages/var": 0.2459769395922331,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 2.678762006403415,
"grad_norm": 0.23411680945362368,
"learning_rate": 4.486222006197219e-07,
"loss": 0.0,
"num_tokens": 99728810.0,
"reward": 0.6875,
"reward_std": 0.12217670679092407,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 627
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.117555573805927e-09,
"advantages/std": 0.522786021232605,
"advantages/var": 0.2733052239962177,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.6830309498399147,
"grad_norm": 0.30473616588248703,
"learning_rate": 4.472926143331611e-07,
"loss": -0.0,
"num_tokens": 99893410.0,
"reward": 0.62109375,
"reward_std": 0.1332252472639084,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 628
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.972431973776251e-09,
"advantages/std": 0.5483101010322571,
"advantages/var": 0.30064396689400397,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.687299893276414,
"grad_norm": 0.2796635033475051,
"learning_rate": 4.459634048330088e-07,
"loss": 0.0,
"num_tokens": 100042004.0,
"reward": 0.7421875,
"reward_std": 0.15558435022830963,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 629
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.323229080974593e-10,
"advantages/std": 0.43738609552383423,
"advantages/var": 0.19130659655758464,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 2.6915688367129134,
"grad_norm": 0.25731141790066964,
"learning_rate": 4.4463458162131293e-07,
"loss": 0.0,
"num_tokens": 100195781.0,
"reward": 0.6875,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 630
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.8778354937898005e-09,
"advantages/std": 0.49595534801483154,
"advantages/var": 0.24597170722451267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.695837780149413,
"grad_norm": 0.23915462953716904,
"learning_rate": 4.43306154197359e-07,
"loss": -0.0,
"num_tokens": 100357115.0,
"reward": 0.64453125,
"reward_std": 0.11652141809463501,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 631
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.8748187096312636e-09,
"advantages/std": 0.4049483835697174,
"advantages/var": 0.16398319335572697,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.578125,
"epoch": 2.7001067235859124,
"grad_norm": 0.22316868876816248,
"learning_rate": 4.4197813205760363e-07,
"loss": -0.0,
"num_tokens": 100518712.0,
"reward": 0.66796875,
"reward_std": 0.07995839416980743,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 632
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 8.907147278172826e-10,
"advantages/std": 0.5227950811386108,
"advantages/var": 0.2733146968627267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.704375667022412,
"grad_norm": 0.2671767594650501,
"learning_rate": 4.4065052469560634e-07,
"loss": -0.0,
"num_tokens": 100663102.0,
"reward": 0.765625,
"reward_std": 0.1429445743560791,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 633
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.8895334850316917e-09,
"advantages/std": 0.36966368556022644,
"advantages/var": 0.13665124042196997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.640625,
"epoch": 2.7086446104589115,
"grad_norm": 0.20871265591814175,
"learning_rate": 4.3932334160196105e-07,
"loss": -0.0,
"num_tokens": 100826005.0,
"reward": 0.59765625,
"reward_std": 0.06549245119094849,
"rewards/drgrpo_math_reward/mean": 0.59765625,
"rewards/drgrpo_math_reward/std": 0.4913311004638672,
"step": 634
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 3.726209577886217e-09,
"advantages/std": 0.43739205598831177,
"advantages/var": 0.19131181064168246,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.712913553895411,
"grad_norm": 0.2626117494758664,
"learning_rate": 4.3799659226422934e-07,
"loss": -0.0,
"num_tokens": 100961517.0,
"reward": 0.80078125,
"reward_std": 0.09153735637664795,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 635
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.7249169957935134e-09,
"advantages/std": 0.4049423336982727,
"advantages/var": 0.16397829362100325,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.71718249733191,
"grad_norm": 0.16295120586308373,
"learning_rate": 4.3667028616687156e-07,
"loss": 0.0,
"num_tokens": 101131306.0,
"reward": 0.64453125,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 636
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.7555197873748836e-09,
"advantages/std": 0.4959753155708313,
"advantages/var": 0.2459915136555857,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.59375,
"epoch": 2.72145144076841,
"grad_norm": 0.22277699164789738,
"learning_rate": 4.3534443279117966e-07,
"loss": -0.0,
"num_tokens": 101306566.0,
"reward": 0.5625,
"reward_std": 0.13584628701210022,
"rewards/drgrpo_math_reward/mean": 0.5625,
"rewards/drgrpo_math_reward/std": 0.49705013632774353,
"step": 637
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5726798176765442,
"advantages/var": 0.3279621735740399,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.725720384204909,
"grad_norm": 0.27680391145365696,
"learning_rate": 4.3401904161520943e-07,
"loss": 0.0,
"num_tokens": 101465332.0,
"reward": 0.71875,
"reward_std": 0.15585274994373322,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 638
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.6590501926319976e-09,
"advantages/std": 0.5726829767227173,
"advantages/var": 0.32796579182799235,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.729989327641409,
"grad_norm": 0.295537748982643,
"learning_rate": 4.3269412211371207e-07,
"loss": 0.0,
"num_tokens": 101617777.0,
"reward": 0.63671875,
"reward_std": 0.15927013754844666,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 639
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.47316264651047e-09,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.734258271077908,
"grad_norm": 0.2862836629920742,
"learning_rate": 4.3136968375806764e-07,
"loss": 0.0,
"num_tokens": 101763561.0,
"reward": 0.78515625,
"reward_std": 0.10376539826393127,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 640
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.2523525863272304e-09,
"advantages/std": 0.2863534986972809,
"advantages/var": 0.08199832621617364,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.7385272145144075,
"grad_norm": 0.14058634688523225,
"learning_rate": 4.3004573601621576e-07,
"loss": -0.0,
"num_tokens": 101889882.0,
"reward": 0.8046875,
"reward_std": 0.0468127615749836,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 641
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.49596306681632996,
"advantages/var": 0.24597936364585937,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.7427961579509073,
"grad_norm": 0.2910648666966365,
"learning_rate": 4.287222883525896e-07,
"loss": 0.0,
"num_tokens": 102029621.0,
"reward": 0.8046875,
"reward_std": 0.12388662248849869,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 642
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.472025841892847e-09,
"advantages/std": 0.5727017521858215,
"advantages/var": 0.32798729695671014,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.7470651013874066,
"grad_norm": 0.31586403545400615,
"learning_rate": 4.2739935022804753e-07,
"loss": 0.0,
"num_tokens": 102187460.0,
"reward": 0.71875,
"reward_std": 0.18253791332244873,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 643
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 7.8121162940244e-09,
"advantages/std": 0.596075713634491,
"advantages/var": 0.3553062563848677,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.7513340448239063,
"grad_norm": 0.297871265469956,
"learning_rate": 4.260769310998043e-07,
"loss": 0.0,
"num_tokens": 102336263.0,
"reward": 0.75,
"reward_std": 0.1822783350944519,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 644
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.5208473665728075e-09,
"advantages/std": 0.33064574003219604,
"advantages/var": 0.10932660540143857,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.7556029882604056,
"grad_norm": 0.15027070320158128,
"learning_rate": 4.247550404213661e-07,
"loss": -0.0,
"num_tokens": 102491662.0,
"reward": 0.76171875,
"reward_std": 0.05786130577325821,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 645
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.562930811330029e-09,
"advantages/std": 0.5227845311164856,
"advantages/var": 0.2733036659746837,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.759871931696905,
"grad_norm": 0.27059176426281184,
"learning_rate": 4.2343368764245994e-07,
"loss": -0.0,
"num_tokens": 102637046.0,
"reward": 0.81640625,
"reward_std": 0.13098736107349396,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 646
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.344371843712344e-09,
"advantages/std": 0.5227869153022766,
"advantages/var": 0.27330615881126974,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.7641408751334042,
"grad_norm": 0.25141226205888784,
"learning_rate": 4.221128822089687e-07,
"loss": 0.0,
"num_tokens": 102788775.0,
"reward": 0.86328125,
"reward_std": 0.1344023048877716,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"step": 647
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.258416134281467e-09,
"advantages/std": 0.43740326166152954,
"advantages/var": 0.19132161331214448,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 2.768409818569904,
"grad_norm": 0.20481933865285598,
"learning_rate": 4.207926335628617e-07,
"loss": -0.0,
"num_tokens": 102947269.0,
"reward": 0.71484375,
"reward_std": 0.10178709030151367,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 648
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.0111997838878266e-09,
"advantages/std": 0.6185724139213562,
"advantages/var": 0.3826318312644936,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.7726787620064033,
"grad_norm": 0.31884596247892505,
"learning_rate": 4.1947295114212847e-07,
"loss": 0.0,
"num_tokens": 103114323.0,
"reward": 0.71875,
"reward_std": 0.1910865604877472,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 649
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.5049693187805794e-09,
"advantages/std": 0.5726837515830994,
"advantages/var": 0.32796667932729306,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.776947705442903,
"grad_norm": 0.2868042474367118,
"learning_rate": 4.1815384438071086e-07,
"loss": 0.0,
"num_tokens": 103281877.0,
"reward": 0.66796875,
"reward_std": 0.16044721007347107,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 650
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983533706996105e-09,
"advantages/std": 0.46758612990379333,
"advantages/var": 0.2186367888784071,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.7812166488794023,
"grad_norm": 0.2693976494278633,
"learning_rate": 4.1683532270843495e-07,
"loss": -0.0,
"num_tokens": 103432427.0,
"reward": 0.7109375,
"reward_std": 0.09969891607761383,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 651
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.439349784680999e-09,
"advantages/std": 0.5726869702339172,
"advantages/var": 0.3279703658757036,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.7854855923159016,
"grad_norm": 0.3233577993325011,
"learning_rate": 4.155173955509449e-07,
"loss": -0.0,
"num_tokens": 103589790.0,
"reward": 0.6328125,
"reward_std": 0.1638646125793457,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 652
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.0646176705942432e-09,
"advantages/std": 0.43739765882492065,
"advantages/var": 0.1913167119455217,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.7897545357524014,
"grad_norm": 0.1948590790756499,
"learning_rate": 4.1420007232963435e-07,
"loss": 0.0,
"num_tokens": 103738326.0,
"reward": 0.80078125,
"reward_std": 0.09666222333908081,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 653
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.4854458187541657e-09,
"advantages/std": 0.46760573983192444,
"advantages/var": 0.2186551279237614,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 2.7940234791889007,
"grad_norm": 0.22912033472464882,
"learning_rate": 4.1288336246157996e-07,
"loss": 0.0,
"num_tokens": 103899619.0,
"reward": 0.73046875,
"reward_std": 0.11849091947078705,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 654
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.9794387150541465e-09,
"advantages/std": 0.46758410334587097,
"advantages/var": 0.21863489370176215,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.7982924226254005,
"grad_norm": 0.32781694472570583,
"learning_rate": 4.1156727535947383e-07,
"loss": -0.0,
"num_tokens": 104021397.0,
"reward": 0.84765625,
"reward_std": 0.0969306081533432,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"step": 655
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.63477036056223e-09,
"advantages/std": 0.6185793280601501,
"advantages/var": 0.38264038510334686,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.8025613660618998,
"grad_norm": 0.39977204866692856,
"learning_rate": 4.1025182043155545e-07,
"loss": -0.0,
"num_tokens": 104164304.0,
"reward": 0.76171875,
"reward_std": 0.1996288150548935,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 656
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 9.389194959261648e-10,
"advantages/std": 0.4959544241428375,
"advantages/var": 0.24597079082685358,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 2.806830309498399,
"grad_norm": 0.2942172566228977,
"learning_rate": 4.089370070815462e-07,
"loss": -0.0,
"num_tokens": 104312308.0,
"reward": 0.73828125,
"reward_std": 0.11534436047077179,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 657
},
{
"advantages/mean": -6.05359673500061e-09,
"advantages/snr": 1.383982320429446e-08,
"advantages/std": 0.43740418553352356,
"advantages/var": 0.1913224215222451,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.8110992529348984,
"grad_norm": 0.19964071208461728,
"learning_rate": 4.0762284470857995e-07,
"loss": 0.0,
"num_tokens": 104457041.0,
"reward": 0.75390625,
"reward_std": 0.10125912725925446,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 658
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 3.9060479917862134e-10,
"advantages/std": 0.5960772633552551,
"advantages/var": 0.3553081038890902,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.815368196371398,
"grad_norm": 0.32815408933650064,
"learning_rate": 4.0630934270713755e-07,
"loss": 0.0,
"num_tokens": 104613246.0,
"reward": 0.7109375,
"reward_std": 0.18463245034217834,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 659
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.7342417965515424e-09,
"advantages/std": 0.5960754752159119,
"advantages/var": 0.35530597215387516,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.8196371398078974,
"grad_norm": 0.2767264996596556,
"learning_rate": 4.0499651046697946e-07,
"loss": -0.0,
"num_tokens": 104773937.0,
"reward": 0.71484375,
"reward_std": 0.1817479282617569,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 660
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.347298598764249e-09,
"advantages/std": 0.4959544539451599,
"advantages/var": 0.24597082038804174,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 2.823906083244397,
"grad_norm": 0.2577226845041563,
"learning_rate": 4.036843573730773e-07,
"loss": 0.0,
"num_tokens": 104928752.0,
"reward": 0.71484375,
"reward_std": 0.11534436047077179,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 661
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.680367313377009e-09,
"advantages/std": 0.5227945446968079,
"advantages/var": 0.27331413596474263,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.8281750266808965,
"grad_norm": 0.29370182040314274,
"learning_rate": 4.0237289280554853e-07,
"loss": 0.0,
"num_tokens": 105074789.0,
"reward": 0.77734375,
"reward_std": 0.1422979235649109,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 662
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.489668896152697e-09,
"advantages/std": 0.46759358048439026,
"advantages/var": 0.21864375651021195,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.832443970117396,
"grad_norm": 0.19437045676291703,
"learning_rate": 4.0106212613958796e-07,
"loss": -0.0,
"num_tokens": 105229364.0,
"reward": 0.66015625,
"reward_std": 0.1060032919049263,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 663
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.6945125683615966e-09,
"advantages/std": 0.4959633946418762,
"advantages/var": 0.24597968882469345,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 2.8367129135538955,
"grad_norm": 0.2647116216149255,
"learning_rate": 3.9975206674540196e-07,
"loss": -0.0,
"num_tokens": 105382844.0,
"reward": 0.75390625,
"reward_std": 0.12441704422235489,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 664
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.6985691161132508e-09,
"advantages/std": 0.5482982993125916,
"advantages/var": 0.30063102502908023,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.840981856990395,
"grad_norm": 0.3388296310037995,
"learning_rate": 3.9844272398814026e-07,
"loss": 0.0,
"num_tokens": 105524271.0,
"reward": 0.828125,
"reward_std": 0.14085884392261505,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 665
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.8182126795653633e-09,
"advantages/std": 0.64027339220047,
"advantages/var": 0.40995001675989684,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.8452508004268946,
"grad_norm": 0.36566203134323744,
"learning_rate": 3.9713410722783014e-07,
"loss": -0.0,
"num_tokens": 105677187.0,
"reward": 0.6796875,
"reward_std": 0.19135494530200958,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 666
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1232026243611865e-09,
"advantages/std": 0.548300564289093,
"advantages/var": 0.3006335087997378,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.849519743863394,
"grad_norm": 0.3015780894876757,
"learning_rate": 3.958262258193088e-07,
"loss": 0.0,
"num_tokens": 105842642.0,
"reward": 0.6875,
"reward_std": 0.1442737877368927,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 667
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.8819989581979145e-09,
"advantages/std": 0.6185727119445801,
"advantages/var": 0.38263219996247244,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 2.853788687299893,
"grad_norm": 0.4140352972264349,
"learning_rate": 3.9451908911215637e-07,
"loss": -0.0,
"num_tokens": 106019311.0,
"reward": 0.61328125,
"reward_std": 0.191616952419281,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 668
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 6.3877060235654596e-09,
"advantages/std": 0.43739765882492065,
"advantages/var": 0.1913167119455217,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 2.8580576307363925,
"grad_norm": 0.31121658514582756,
"learning_rate": 3.932127064506299e-07,
"loss": 0.0,
"num_tokens": 106166230.0,
"reward": 0.76953125,
"reward_std": 0.09666221588850021,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 669
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.0779075803624075e-09,
"advantages/std": 0.5960719585418701,
"advantages/var": 0.35530177975994093,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 2.8623265741728923,
"grad_norm": 0.23713356471530259,
"learning_rate": 3.919070871735955e-07,
"loss": -0.0,
"num_tokens": 106337094.0,
"reward": 0.69921875,
"reward_std": 0.17938891053199768,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 670
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.547875832215062e-09,
"advantages/std": 0.5482935309410095,
"advantages/var": 0.30062579607175977,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.8665955176093916,
"grad_norm": 0.2832364826132542,
"learning_rate": 3.906022406144624e-07,
"loss": 0.0,
"num_tokens": 106479557.0,
"reward": 0.76171875,
"reward_std": 0.13520357012748718,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 671
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.0646322509902576e-09,
"advantages/std": 0.4373916685581207,
"advantages/var": 0.19131147172405694,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.8708644610458913,
"grad_norm": 0.19208483130815907,
"learning_rate": 3.8929817610111633e-07,
"loss": 0.0,
"num_tokens": 106619046.0,
"reward": 0.75,
"reward_std": 0.09100693464279175,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 672
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.979314307724204e-09,
"advantages/std": 0.46759578585624695,
"advantages/var": 0.21864581895052115,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.8751334044823906,
"grad_norm": 0.23726851480513966,
"learning_rate": 3.879949029558515e-07,
"loss": -0.0,
"num_tokens": 106764635.0,
"reward": 0.78125,
"reward_std": 0.10718279331922531,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 673
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.3875618921514382e-09,
"advantages/std": 0.6185793280601501,
"advantages/var": 0.38264038510334686,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.87940234791889,
"grad_norm": 0.30002245157561425,
"learning_rate": 3.866924304953058e-07,
"loss": -0.0,
"num_tokens": 106923263.0,
"reward": 0.68359375,
"reward_std": 0.1996288150548935,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 674
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.4157574238941685e-09,
"advantages/std": 0.6816369295120239,
"advantages/var": 0.4646289036745799,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.8836712913553897,
"grad_norm": 0.36167138258359255,
"learning_rate": 3.853907680303928e-07,
"loss": 0.0,
"num_tokens": 107079263.0,
"reward": 0.77734375,
"reward_std": 0.23619185388088226,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 675
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.694508619074992e-10,
"advantages/std": 0.49596381187438965,
"advantages/var": 0.24598010268897497,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.887940234791889,
"grad_norm": 0.28628604820320996,
"learning_rate": 3.840899248662358e-07,
"loss": -0.0,
"num_tokens": 107221210.0,
"reward": 0.7578125,
"reward_std": 0.12665247917175293,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 676
},
{
"advantages/mean": -5.3551048040390015e-09,
"advantages/snr": 1.024351834685619e-08,
"advantages/std": 0.5227798223495483,
"advantages/var": 0.2732987426558253,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.8922091782283887,
"grad_norm": 0.26532878266531107,
"learning_rate": 3.827899103021016e-07,
"loss": 0.0,
"num_tokens": 107375048.0,
"reward": 0.86328125,
"reward_std": 0.1258624941110611,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"step": 677
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.258498537262613e-09,
"advantages/std": 0.43739479780197144,
"advantages/var": 0.19131420914422748,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.896478121664888,
"grad_norm": 0.20238707133078684,
"learning_rate": 3.814907336313329e-07,
"loss": 0.0,
"num_tokens": 107512425.0,
"reward": 0.80859375,
"reward_std": 0.09324727952480316,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 678
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.235152379307402e-09,
"advantages/std": 0.522782564163208,
"advantages/var": 0.2733016093930587,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.9007470651013874,
"grad_norm": 0.26416420308482647,
"learning_rate": 3.801924041412833e-07,
"loss": 0.0,
"num_tokens": 107662320.0,
"reward": 0.765625,
"reward_std": 0.12980784475803375,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 679
},
{
"advantages/mean": -6.51925802230835e-09,
"advantages/snr": 1.138353280650014e-08,
"advantages/std": 0.5726919770240784,
"advantages/var": 0.3279761005477475,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 2.9050160085378867,
"grad_norm": 0.2978908074569031,
"learning_rate": 3.788949311132497e-07,
"loss": 0.0,
"num_tokens": 107823472.0,
"reward": 0.71875,
"reward_std": 0.16846150159835815,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 680
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.493814952589826e-09,
"advantages/std": 0.4675893187522888,
"advantages/var": 0.21863977101122956,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.9092849519743864,
"grad_norm": 0.2536239393904906,
"learning_rate": 3.77598323822407e-07,
"loss": 0.0,
"num_tokens": 107976960.0,
"reward": 0.69921875,
"reward_std": 0.10205547511577606,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 681
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.855440276329792e-09,
"advantages/std": 0.4373944401741028,
"advantages/var": 0.19131389629521678,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 2.9135538954108857,
"grad_norm": 0.27394085605791346,
"learning_rate": 3.763025915377402e-07,
"loss": 0.0,
"num_tokens": 108121599.0,
"reward": 0.75,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 682
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 9.389061808844325e-09,
"advantages/std": 0.495961457490921,
"advantages/var": 0.24597776731651866,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.9178228388473855,
"grad_norm": 0.21000631620580057,
"learning_rate": 3.750077435219806e-07,
"loss": 0.0,
"num_tokens": 108270769.0,
"reward": 0.7578125,
"reward_std": 0.12164874374866486,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 683
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.246344777610733e-10,
"advantages/std": 0.5483083724975586,
"advantages/var": 0.30064207135092147,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 2.9220917822838848,
"grad_norm": 0.3039058762004899,
"learning_rate": 3.7371378903153743e-07,
"loss": 0.0,
"num_tokens": 108419518.0,
"reward": 0.75390625,
"reward_std": 0.1528160572052002,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 684
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 2.129257973375593e-09,
"advantages/std": 0.4373930096626282,
"advantages/var": 0.19131264490173194,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.926360725720384,
"grad_norm": 0.199255740500126,
"learning_rate": 3.724207373164321e-07,
"loss": 0.0,
"num_tokens": 108573111.0,
"reward": 0.67578125,
"reward_std": 0.09100939333438873,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 685
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.5968822224932214e-09,
"advantages/std": 0.43740978837013245,
"advantages/var": 0.19132732296200405,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 2.930629669156884,
"grad_norm": 0.20397965382033725,
"learning_rate": 3.7112859762023305e-07,
"loss": 0.0,
"num_tokens": 108718246.0,
"reward": 0.69140625,
"reward_std": 0.10638399422168732,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 686
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.485497578361386e-09,
"advantages/std": 0.4675987958908081,
"advantages/var": 0.21864863391853362,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 2.934898612593383,
"grad_norm": 0.27493274840434406,
"learning_rate": 3.698373791799885e-07,
"loss": -0.0,
"num_tokens": 108886492.0,
"reward": 0.65234375,
"reward_std": 0.11112816631793976,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 687
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.6985266479578181e-09,
"advantages/std": 0.5483120083808899,
"advantages/var": 0.30064605853468507,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.9391675560298824,
"grad_norm": 0.297621247232595,
"learning_rate": 3.6854709122616143e-07,
"loss": 0.0,
"num_tokens": 109066092.0,
"reward": 0.66015625,
"reward_std": 0.15676385164260864,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 688
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.6721981084975215e-09,
"advantages/std": 0.5227845311164856,
"advantages/var": 0.2733036659746837,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.943436499466382,
"grad_norm": 0.2978856950784898,
"learning_rate": 3.6725774298256286e-07,
"loss": 0.0,
"num_tokens": 109232524.0,
"reward": 0.75390625,
"reward_std": 0.13098736107349396,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 689
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.516804898616483e-09,
"advantages/std": 0.6185716986656189,
"advantages/var": 0.3826309463900692,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.9477054429028815,
"grad_norm": 0.3153950668234978,
"learning_rate": 3.6596934366628584e-07,
"loss": 0.0,
"num_tokens": 109377958.0,
"reward": 0.6484375,
"reward_std": 0.18990948796272278,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 690
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.907156417836012e-10,
"advantages/std": 0.5227945446968079,
"advantages/var": 0.27331413596474263,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 2.951974386339381,
"grad_norm": 0.23800738969758298,
"learning_rate": 3.646819024876406e-07,
"loss": 0.0,
"num_tokens": 109519667.0,
"reward": 0.67578125,
"reward_std": 0.1422979235649109,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 691
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.874775760889987e-09,
"advantages/std": 0.4049544334411621,
"advantages/var": 0.1639880931636526,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.9562433297758806,
"grad_norm": 0.21111161810104676,
"learning_rate": 3.633954286500872e-07,
"loss": -0.0,
"num_tokens": 109663347.0,
"reward": 0.76171875,
"reward_std": 0.0850832611322403,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 692
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.4393556240477054e-09,
"advantages/std": 0.5726855993270874,
"advantages/var": 0.3279687956766253,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 2.96051227321238,
"grad_norm": 0.2781378250410254,
"learning_rate": 3.621099313501711e-07,
"loss": 0.0,
"num_tokens": 109815831.0,
"reward": 0.671875,
"reward_std": 0.16162671148777008,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 693
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.0646405206460833e-09,
"advantages/std": 0.43738827109336853,
"advantages/var": 0.19130849969004604,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 2.9647812166488796,
"grad_norm": 0.21914340104481092,
"learning_rate": 3.608254197774567e-07,
"loss": 0.0,
"num_tokens": 109967825.0,
"reward": 0.66015625,
"reward_std": 0.08865037560462952,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 694
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 5.3230332244416e-09,
"advantages/std": 0.4374021887779236,
"advantages/var": 0.1913206747477183,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 2.969050160085379,
"grad_norm": 0.22185614338112256,
"learning_rate": 3.5954190311446144e-07,
"loss": 0.0,
"num_tokens": 110116126.0,
"reward": 0.73828125,
"reward_std": 0.10061003267765045,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 695
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 9.049287811310864e-09,
"advantages/std": 0.437395840883255,
"advantages/var": 0.19131512162196973,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 2.9733191035218782,
"grad_norm": 0.225793553689538,
"learning_rate": 3.582593905365912e-07,
"loss": 0.0,
"num_tokens": 110265773.0,
"reward": 0.67578125,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 696
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 2.519352591236486e-09,
"advantages/std": 0.3696674108505249,
"advantages/var": 0.13665399464493078,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 2.9775880469583775,
"grad_norm": 0.20542793794206402,
"learning_rate": 3.5697789121207295e-07,
"loss": -0.0,
"num_tokens": 110394807.0,
"reward": 0.8671875,
"reward_std": 0.0677327960729599,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"step": 697
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.131143099359766e-10,
"advantages/std": 0.5726885795593262,
"advantages/var": 0.32797220915767866,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 2.9818569903948773,
"grad_norm": 0.28886301623000166,
"learning_rate": 3.556974143018916e-07,
"loss": -0.0,
"num_tokens": 110549186.0,
"reward": 0.69921875,
"reward_std": 0.16663289070129395,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 698
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.694515671377163e-10,
"advantages/std": 0.49596306681632996,
"advantages/var": 0.24597936364585937,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 2.9861259338313766,
"grad_norm": 0.24207518599909547,
"learning_rate": 3.54417968959722e-07,
"loss": 0.0,
"num_tokens": 110707900.0,
"reward": 0.6328125,
"reward_std": 0.12388662248849869,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 699
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.129219287094493e-09,
"advantages/std": 0.6185618042945862,
"advantages/var": 0.38261870573217394,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.9903948772678763,
"grad_norm": 0.28677140760275627,
"learning_rate": 3.531395643318653e-07,
"loss": 0.0,
"num_tokens": 110869282.0,
"reward": 0.6953125,
"reward_std": 0.17794983088970184,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 700
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.75558162001709e-09,
"advantages/std": 0.49596714973449707,
"advantages/var": 0.24598341361576104,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 2.9946638207043756,
"grad_norm": 0.25190631294689303,
"learning_rate": 3.5186220955718303e-07,
"loss": -0.0,
"num_tokens": 111025469.0,
"reward": 0.6953125,
"reward_std": 0.12953945994377136,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 701
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.1175257168546057e-09,
"advantages/std": 0.5227910280227661,
"advantages/var": 0.2733104589811006,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 2.998932764140875,
"grad_norm": 0.29505803290443655,
"learning_rate": 3.505859137670313e-07,
"loss": 0.0,
"num_tokens": 111186315.0,
"reward": 0.5546875,
"reward_std": 0.13888052105903625,
"rewards/drgrpo_math_reward/mean": 0.5546875,
"rewards/drgrpo_math_reward/std": 0.49797385931015015,
"step": 702
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.258565854740685e-09,
"advantages/std": 0.4373878836631775,
"advantages/var": 0.1913081607753533,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.0042689434364993,
"grad_norm": 0.24085320145014932,
"learning_rate": 3.493106860851962e-07,
"loss": 0.0,
"num_tokens": 111307504.0,
"reward": 0.8046875,
"reward_std": 0.08811995387077332,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 703
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.6721548464238397e-09,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 3.008537886872999,
"grad_norm": 0.2488154060969329,
"learning_rate": 3.4803653562782804e-07,
"loss": -0.0,
"num_tokens": 111482548.0,
"reward": 0.62890625,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 704
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.9724216339261817e-09,
"advantages/std": 0.5483120083808899,
"advantages/var": 0.30064605853468507,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.0128068303094984,
"grad_norm": 0.2594574520332894,
"learning_rate": 3.467634715033767e-07,
"loss": 0.0,
"num_tokens": 111628416.0,
"reward": 0.76171875,
"reward_std": 0.15676386654376984,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 705
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.3971448795952543e-09,
"advantages/std": 0.5482972264289856,
"advantages/var": 0.3006298485097183,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 3.0170757737459977,
"grad_norm": 0.25315906824862405,
"learning_rate": 3.454915028125263e-07,
"loss": -0.0,
"num_tokens": 111792422.0,
"reward": 0.71484375,
"reward_std": 0.14085638523101807,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 706
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.749631495215814e-10,
"advantages/std": 0.40494880080223083,
"advantages/var": 0.16398353127116483,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.0213447171824974,
"grad_norm": 0.21177046794302226,
"learning_rate": 3.442206386481297e-07,
"loss": -0.0,
"num_tokens": 111941877.0,
"reward": 0.703125,
"reward_std": 0.08048881590366364,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 707
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.344384639658041e-09,
"advantages/std": 0.5227856636047363,
"advantages/var": 0.27330485007064453,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.0256136606189967,
"grad_norm": 0.3119065153498931,
"learning_rate": 3.429508880951444e-07,
"loss": -0.0,
"num_tokens": 112094054.0,
"reward": 0.7421875,
"reward_std": 0.13269482553005219,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 708
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.0646084583111024e-09,
"advantages/std": 0.4374014437198639,
"advantages/var": 0.19132002296822126,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.0298826040554965,
"grad_norm": 0.23739593038026452,
"learning_rate": 3.4168226023056636e-07,
"loss": 0.0,
"num_tokens": 112239948.0,
"reward": 0.81640625,
"reward_std": 0.09954920411109924,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 709
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 7.468917193812455e-09,
"advantages/std": 0.46759918332099915,
"advantages/var": 0.21864899624246537,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.034151547491996,
"grad_norm": 0.24161547263417743,
"learning_rate": 3.404147641233667e-07,
"loss": -0.0,
"num_tokens": 112390687.0,
"reward": 0.75,
"reward_std": 0.11165857315063477,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 710
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.5628885646149507e-09,
"advantages/std": 0.5227907299995422,
"advantages/var": 0.27331014737345427,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.038420490928495,
"grad_norm": 0.3094102816726311,
"learning_rate": 3.391484088344256e-07,
"loss": -0.0,
"num_tokens": 112550104.0,
"reward": 0.71484375,
"reward_std": 0.13835011422634125,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 711
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 3.2727170154969998e-09,
"advantages/std": 0.6402862668037415,
"advantages/var": 0.409966503457472,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.042689434364995,
"grad_norm": 0.3603208332066757,
"learning_rate": 3.378832034164676e-07,
"loss": -0.0,
"num_tokens": 112710103.0,
"reward": 0.63671875,
"reward_std": 0.20779281854629517,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 712
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 7.041808204615512e-09,
"advantages/std": 0.49596062302589417,
"advantages/var": 0.2459769395922331,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 3.046958377801494,
"grad_norm": 0.24706872570747945,
"learning_rate": 3.366191569139981e-07,
"loss": -0.0,
"num_tokens": 112867011.0,
"reward": 0.796875,
"reward_std": 0.12217669934034348,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 713
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.4938024801910196e-09,
"advantages/std": 0.4675932228565216,
"advantages/var": 0.21864342206134868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.0512273212379935,
"grad_norm": 0.21995529779889317,
"learning_rate": 3.3535627836323674e-07,
"loss": -0.0,
"num_tokens": 113021282.0,
"reward": 0.75,
"reward_std": 0.1054728776216507,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 714
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 3.906124938468067e-09,
"advantages/std": 0.5960655212402344,
"advantages/var": 0.3552941056113923,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.055496264674493,
"grad_norm": 0.34320023408995987,
"learning_rate": 3.3409457679205466e-07,
"loss": -0.0,
"num_tokens": 113172698.0,
"reward": 0.671875,
"reward_std": 0.17031869292259216,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 715
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196787006259086e-09,
"advantages/std": 0.572685182094574,
"advantages/var": 0.32796831779069535,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.0597652081109925,
"grad_norm": 0.2931749535923372,
"learning_rate": 3.328340612199091e-07,
"loss": -0.0,
"num_tokens": 113331565.0,
"reward": 0.70703125,
"reward_std": 0.1626850962638855,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 716
},
{
"advantages/mean": -5.587935447692871e-09,
"advantages/snr": 1.2775343281201946e-08,
"advantages/std": 0.4374000132083893,
"advantages/var": 0.19131877155469912,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.064034151547492,
"grad_norm": 0.19166950434503208,
"learning_rate": 3.3157474065777867e-07,
"loss": -0.0,
"num_tokens": 113483236.0,
"reward": 0.7890625,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 717
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.599778655449369e-09,
"advantages/std": 0.4049423336982727,
"advantages/var": 0.16397829362100325,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.0683030949839916,
"grad_norm": 0.22559544144317178,
"learning_rate": 3.3031662410809955e-07,
"loss": 0.0,
"num_tokens": 113628029.0,
"reward": 0.72265625,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 718
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 3.4496892771862277e-09,
"advantages/std": 0.4049593210220337,
"advantages/var": 0.16399205168262654,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.072572038420491,
"grad_norm": 0.20541126227691692,
"learning_rate": 3.290597205647008e-07,
"loss": 0.0,
"num_tokens": 113768903.0,
"reward": 0.73828125,
"reward_std": 0.08903107047080994,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 719
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.899019811574457e-09,
"advantages/std": 0.5227856040000916,
"advantages/var": 0.27330478774974054,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 3.07684098185699,
"grad_norm": 0.22644770218390384,
"learning_rate": 3.278040390127402e-07,
"loss": -0.0,
"num_tokens": 113930172.0,
"reward": 0.71875,
"reward_std": 0.13098980486392975,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 720
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.492689555221465e-09,
"advantages/std": 0.5483083724975586,
"advantages/var": 0.30064207135092147,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.08110992529349,
"grad_norm": 0.3173400630165274,
"learning_rate": 3.2654958842863966e-07,
"loss": 0.0,
"num_tokens": 114077777.0,
"reward": 0.73828125,
"reward_std": 0.1528160572052002,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 721
},
{
"advantages/mean": -5.122274160385132e-09,
"advantages/snr": 1.0954297384457748e-08,
"advantages/std": 0.4676040709018707,
"advantages/var": 0.21865356712400175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.0853788687299892,
"grad_norm": 0.3356049531649926,
"learning_rate": 3.252963777800217e-07,
"loss": 0.0,
"num_tokens": 114224987.0,
"reward": 0.71484375,
"reward_std": 0.11625301837921143,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 722
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.855416338439584e-09,
"advantages/std": 0.43739622831344604,
"advantages/var": 0.19131546054282822,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.089647812166489,
"grad_norm": 0.2177588528741852,
"learning_rate": 3.2404441602564505e-07,
"loss": 0.0,
"num_tokens": 114374152.0,
"reward": 0.6796875,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 723
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 5.749467742471436e-10,
"advantages/std": 0.4049603343009949,
"advantages/var": 0.16399287235717352,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.0939167556029883,
"grad_norm": 0.20650317992058692,
"learning_rate": 3.2279371211533975e-07,
"loss": -0.0,
"num_tokens": 114523324.0,
"reward": 0.80859375,
"reward_std": 0.08850310742855072,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 724
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 3.1492366929915195e-09,
"advantages/std": 0.36966201663017273,
"advantages/var": 0.1366500065390861,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.0981856990394876,
"grad_norm": 0.1744781666559507,
"learning_rate": 3.2154427498994514e-07,
"loss": 0.0,
"num_tokens": 114658106.0,
"reward": 0.8359375,
"reward_std": 0.06378498673439026,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"step": 725
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.21884497705121e-09,
"advantages/std": 0.5483039021492004,
"advantages/var": 0.30063716911203997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.1024546424759873,
"grad_norm": 0.30287765048437704,
"learning_rate": 3.2029611358124365e-07,
"loss": -0.0,
"num_tokens": 114802608.0,
"reward": 0.69921875,
"reward_std": 0.14769119024276733,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 726
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.8990304241357955e-09,
"advantages/std": 0.5227844715118408,
"advantages/var": 0.2733036036539147,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.1067235859124867,
"grad_norm": 0.24978831635545135,
"learning_rate": 3.190492368118988e-07,
"loss": 0.0,
"num_tokens": 114947279.0,
"reward": 0.80078125,
"reward_std": 0.13098734617233276,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 727
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.2186833239762965e-09,
"advantages/std": 0.5483161807060242,
"advantages/var": 0.30065063402404135,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 3.110992529348986,
"grad_norm": 0.25997521211411223,
"learning_rate": 3.1780365359539043e-07,
"loss": 0.0,
"num_tokens": 115098361.0,
"reward": 0.71875,
"reward_std": 0.1613583266735077,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 728
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 7.452348067104926e-09,
"advantages/std": 0.43739622831344604,
"advantages/var": 0.19131546054282822,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.1152614727854857,
"grad_norm": 0.27132323916482515,
"learning_rate": 3.1655937283595113e-07,
"loss": 0.0,
"num_tokens": 115227478.0,
"reward": 0.8828125,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"step": 729
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 4.878690429522677e-09,
"advantages/std": 0.5726880431175232,
"advantages/var": 0.3279715947297781,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.119530416221985,
"grad_norm": 0.3134701671286318,
"learning_rate": 3.153164034285031e-07,
"loss": -0.0,
"num_tokens": 115370940.0,
"reward": 0.76171875,
"reward_std": 0.16557206213474274,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 730
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 3.1938662863450107e-09,
"advantages/std": 0.437395840883255,
"advantages/var": 0.19131512162196973,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.1237993596584843,
"grad_norm": 0.23771120017791628,
"learning_rate": 3.1407475425859343e-07,
"loss": 0.0,
"num_tokens": 115526803.0,
"reward": 0.76171875,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 731
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.2596721319098194e-09,
"advantages/std": 0.3696686327457428,
"advantages/var": 0.13665489803610686,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.128068303094984,
"grad_norm": 0.20839087665717526,
"learning_rate": 3.128344342023319e-07,
"loss": -0.0,
"num_tokens": 115665824.0,
"reward": 0.8203125,
"reward_std": 0.06890984624624252,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 732
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.9724655787857923e-09,
"advantages/std": 0.5483039021492004,
"advantages/var": 0.30063716911203997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.1323372465314834,
"grad_norm": 0.2824516520486795,
"learning_rate": 3.1159545212632695e-07,
"loss": 0.0,
"num_tokens": 115838105.0,
"reward": 0.72265625,
"reward_std": 0.14769119024276733,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 733
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 8.131212494768329e-10,
"advantages/std": 0.5726836919784546,
"advantages/var": 0.32796661105807345,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.136606189967983,
"grad_norm": 0.3025831327566476,
"learning_rate": 3.1035781688762176e-07,
"loss": 0.0,
"num_tokens": 115996428.0,
"reward": 0.67578125,
"reward_std": 0.15874217450618744,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 734
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.528048060890719e-10,
"advantages/std": 0.6185684204101562,
"advantages/var": 0.3826268907287158,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.1408751334044824,
"grad_norm": 0.3102665039315969,
"learning_rate": 3.09121537333632e-07,
"loss": -0.0,
"num_tokens": 116142635.0,
"reward": 0.78125,
"reward_std": 0.18596167862415314,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 735
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.492925903541175e-10,
"advantages/std": 0.5482931137084961,
"advantages/var": 0.3006253385401578,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.1451440768409817,
"grad_norm": 0.26113775100574044,
"learning_rate": 3.0788662230208145e-07,
"loss": -0.0,
"num_tokens": 116289720.0,
"reward": 0.75,
"reward_std": 0.1362619400024414,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 736
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 5.749545597910266e-10,
"advantages/std": 0.40495485067367554,
"advantages/var": 0.16398843108413885,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 3.1494130202774815,
"grad_norm": 0.2541314504440522,
"learning_rate": 3.0665308062094017e-07,
"loss": 0.0,
"num_tokens": 116418280.0,
"reward": 0.828125,
"reward_std": 0.0856136754155159,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 737
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.8459080821278322e-09,
"advantages/std": 0.5726869702339172,
"advantages/var": 0.3279703658757036,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 3.153681963713981,
"grad_norm": 0.2974488892903314,
"learning_rate": 3.054209211083599e-07,
"loss": -0.0,
"num_tokens": 116585887.0,
"reward": 0.59375,
"reward_std": 0.1638646125793457,
"rewards/drgrpo_math_reward/mean": 0.59375,
"rewards/drgrpo_math_reward/std": 0.49209436774253845,
"step": 738
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.323004209884659e-10,
"advantages/std": 0.4374045729637146,
"advantages/var": 0.19132276044956953,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 3.15795090715048,
"grad_norm": 0.20160205677745852,
"learning_rate": 3.0419015257261195e-07,
"loss": 0.0,
"num_tokens": 116749373.0,
"reward": 0.6484375,
"reward_std": 0.10178954154253006,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 739
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.5624240399785312e-09,
"advantages/std": 0.5960754156112671,
"advantages/var": 0.3553059010961448,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 3.16221985058698,
"grad_norm": 0.2871646775874437,
"learning_rate": 3.029607838120246e-07,
"loss": -0.0,
"num_tokens": 116912138.0,
"reward": 0.65234375,
"reward_std": 0.1817479282617569,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 740
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.0646167275973057e-09,
"advantages/std": 0.4373980462551117,
"advantages/var": 0.19131705086778883,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.166488794023479,
"grad_norm": 0.24069845260442374,
"learning_rate": 3.017328236149186e-07,
"loss": 0.0,
"num_tokens": 117058546.0,
"reward": 0.7578125,
"reward_std": 0.09719263762235641,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 741
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.562928780215032e-09,
"advantages/std": 0.5227848291397095,
"advantages/var": 0.2733039775786352,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 3.1707577374599785,
"grad_norm": 0.24511272617353355,
"learning_rate": 3.005062807595464e-07,
"loss": -0.0,
"num_tokens": 117227763.0,
"reward": 0.5625,
"reward_std": 0.13151776790618896,
"rewards/drgrpo_math_reward/mean": 0.5625,
"rewards/drgrpo_math_reward/std": 0.49705013632774353,
"step": 742
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.4959576427936554,
"advantages/var": 0.24597398344543908,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.175026680896478,
"grad_norm": 0.28007502980297416,
"learning_rate": 2.9928116401402745e-07,
"loss": 0.0,
"num_tokens": 117382433.0,
"reward": 0.69140625,
"reward_std": 0.11982014030218124,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 743
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.098285603281836e-09,
"advantages/std": 0.5726953148841858,
"advantages/var": 0.3279799236902967,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 3.1792956243329775,
"grad_norm": 0.22358854618839655,
"learning_rate": 2.980574821362872e-07,
"loss": -0.0,
"num_tokens": 117555197.0,
"reward": 0.65625,
"reward_std": 0.17399811744689941,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 744
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.3472014185425037e-09,
"advantages/std": 0.49597498774528503,
"advantages/var": 0.24599118846893564,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.1835645677694773,
"grad_norm": 0.2605663955901431,
"learning_rate": 2.9683524387399353e-07,
"loss": -0.0,
"num_tokens": 117697821.0,
"reward": 0.79296875,
"reward_std": 0.13531586527824402,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 745
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 7.81241861977572e-09,
"advantages/std": 0.5960526466369629,
"advantages/var": 0.35527875756292815,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.1878335112059766,
"grad_norm": 0.32213306719526563,
"learning_rate": 2.9561445796449414e-07,
"loss": 0.0,
"num_tokens": 117847690.0,
"reward": 0.734375,
"reward_std": 0.15388324856758118,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 746
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.511355967409318e-09,
"advantages/std": 0.4959544241428375,
"advantages/var": 0.24597079082685358,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.192102454642476,
"grad_norm": 0.33015216030836414,
"learning_rate": 2.943951331347546e-07,
"loss": -0.0,
"num_tokens": 118006200.0,
"reward": 0.76953125,
"reward_std": 0.11534436047077179,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 747
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 5.3443054276452664e-09,
"advantages/std": 0.5227934122085571,
"advantages/var": 0.27331295184866633,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.1963713980789756,
"grad_norm": 0.272846026500217,
"learning_rate": 2.931772781012958e-07,
"loss": -0.0,
"num_tokens": 118145012.0,
"reward": 0.65625,
"reward_std": 0.1422954797744751,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 748
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983451446179363e-09,
"advantages/std": 0.46759578585624695,
"advantages/var": 0.21864581895052115,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.200640341515475,
"grad_norm": 0.2411495898833993,
"learning_rate": 2.9196090157013143e-07,
"loss": 0.0,
"num_tokens": 118292956.0,
"reward": 0.640625,
"reward_std": 0.10718280076980591,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 749
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.694635563755988e-10,
"advantages/std": 0.4959504008293152,
"advantages/var": 0.2459668000827584,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 3.2049092849519742,
"grad_norm": 0.282352941450176,
"learning_rate": 2.9074601223670613e-07,
"loss": 0.0,
"num_tokens": 118459099.0,
"reward": 0.70703125,
"reward_std": 0.11139655113220215,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 750
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.794128748648032e-09,
"advantages/std": 0.5483102202415466,
"advantages/var": 0.3006440976213334,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.209178228388474,
"grad_norm": 0.277899846738354,
"learning_rate": 2.895326187858326e-07,
"loss": 0.0,
"num_tokens": 118591536.0,
"reward": 0.7734375,
"reward_std": 0.1539955735206604,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 751
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.694570397962185e-10,
"advantages/std": 0.49595728516578674,
"advantages/var": 0.2459736287090175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.2134471718249733,
"grad_norm": 0.2643694557106249,
"learning_rate": 2.883207298916304e-07,
"loss": 0.0,
"num_tokens": 118744790.0,
"reward": 0.7421875,
"reward_std": 0.11928972601890564,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 752
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 5.818215037122586e-09,
"advantages/std": 0.6402806043624878,
"advantages/var": 0.4099592523227926,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.2177161152614726,
"grad_norm": 0.3697522220779177,
"learning_rate": 2.8711035421746363e-07,
"loss": 0.0,
"num_tokens": 118901732.0,
"reward": 0.74609375,
"reward_std": 0.2009580433368683,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 753
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.5628515995610897e-09,
"advantages/std": 0.5227961540222168,
"advantages/var": 0.27331581866042143,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.2219850586979724,
"grad_norm": 0.26758234939193326,
"learning_rate": 2.8590150041587886e-07,
"loss": -0.0,
"num_tokens": 119045722.0,
"reward": 0.67578125,
"reward_std": 0.1429470181465149,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 754
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.661586974765028e-09,
"advantages/std": 0.43739062547683716,
"advantages/var": 0.19131055925501883,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.2262540021344717,
"grad_norm": 0.28207170515823043,
"learning_rate": 2.846941771285428e-07,
"loss": 0.0,
"num_tokens": 119187816.0,
"reward": 0.84375,
"reward_std": 0.08982987701892853,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"step": 755
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 6.9115327786001205e-09,
"advantages/std": 0.5726835131645203,
"advantages/var": 0.32796640625045725,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.2305229455709714,
"grad_norm": 0.36454511361568076,
"learning_rate": 2.8348839298618177e-07,
"loss": 0.0,
"num_tokens": 119341187.0,
"reward": 0.7734375,
"reward_std": 0.15991678833961487,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 756
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.131367227291202e-10,
"advantages/std": 0.28633639216423035,
"advantages/var": 0.08198852947762791,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 3.2347918890074707,
"grad_norm": 0.14542160724387604,
"learning_rate": 2.8228415660851916e-07,
"loss": -0.0,
"num_tokens": 119482536.0,
"reward": 0.8125,
"reward_std": 0.036563023924827576,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"step": 757
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.389108072544253e-10,
"advantages/std": 0.49595901370048523,
"advantages/var": 0.2459753432707581,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.23906083244397,
"grad_norm": 0.2659098513284101,
"learning_rate": 2.810814766042132e-07,
"loss": -0.0,
"num_tokens": 119648386.0,
"reward": 0.7109375,
"reward_std": 0.11993882060050964,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 758
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.7556100546892774e-09,
"advantages/std": 0.4959633946418762,
"advantages/var": 0.24597968882469345,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.24332977588047,
"grad_norm": 0.22029640670047446,
"learning_rate": 2.7988036157079753e-07,
"loss": 0.0,
"num_tokens": 119801114.0,
"reward": 0.80078125,
"reward_std": 0.12441704422235489,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 759
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.975227058317357e-09,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.247598719316969,
"grad_norm": 0.2659706965042973,
"learning_rate": 2.78680820094617e-07,
"loss": 0.0,
"num_tokens": 119936957.0,
"reward": 0.79296875,
"reward_std": 0.10376541316509247,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 760
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.6590475268052982e-09,
"advantages/std": 0.5726833939552307,
"advantages/var": 0.327966269712082,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.2518676627534684,
"grad_norm": 0.32066245051467945,
"learning_rate": 2.774828607507683e-07,
"loss": 0.0,
"num_tokens": 120083359.0,
"reward": 0.71875,
"reward_std": 0.15821176767349243,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 761
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.450217068554151e-09,
"advantages/std": 0.4959578514099121,
"advantages/var": 0.24597419037513646,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.256136606189968,
"grad_norm": 0.2510266146709993,
"learning_rate": 2.7628649210303836e-07,
"loss": 0.0,
"num_tokens": 120232789.0,
"reward": 0.77734375,
"reward_std": 0.11993636190891266,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 762
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.369569097398126e-09,
"advantages/std": 0.5483039021492004,
"advantages/var": 0.30063716911203997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.2604055496264674,
"grad_norm": 0.28712600517511083,
"learning_rate": 2.750917227038418e-07,
"loss": -0.0,
"num_tokens": 120380304.0,
"reward": 0.72265625,
"reward_std": 0.14769119024276733,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 763
},
{
"advantages/mean": 3.4924596548080444e-09,
"advantages/snr": 6.6803871160595985e-09,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.2646744930629668,
"grad_norm": 0.29156871329647277,
"learning_rate": 2.7389856109416175e-07,
"loss": -0.0,
"num_tokens": 120542612.0,
"reward": 0.57421875,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 764
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.749631495215814e-10,
"advantages/std": 0.40494880080223083,
"advantages/var": 0.16398353127116483,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.2689434364994665,
"grad_norm": 0.16744214090624027,
"learning_rate": 2.7270701580348734e-07,
"loss": -0.0,
"num_tokens": 120679485.0,
"reward": 0.765625,
"reward_std": 0.08048880845308304,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 765
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.0646255047872349e-09,
"advantages/std": 0.4373944401741028,
"advantages/var": 0.19131389629521678,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 3.273212379935966,
"grad_norm": 0.2318518047895217,
"learning_rate": 2.715170953497532e-07,
"loss": 0.0,
"num_tokens": 120831937.0,
"reward": 0.6796875,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 766
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.218814372272699e-09,
"advantages/std": 0.5483062267303467,
"advantages/var": 0.30063971827127034,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.2774813233724656,
"grad_norm": 0.328974605528526,
"learning_rate": 2.7032880823927906e-07,
"loss": 0.0,
"num_tokens": 120981368.0,
"reward": 0.72265625,
"reward_std": 0.1511061191558838,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 767
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.878730035740397e-09,
"advantages/std": 0.5726833939552307,
"advantages/var": 0.327966269712082,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.281750266808965,
"grad_norm": 0.42304270998951904,
"learning_rate": 2.691421629667076e-07,
"loss": 0.0,
"num_tokens": 121125897.0,
"reward": 0.609375,
"reward_std": 0.15821176767349243,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 768
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 7.041734719091418e-10,
"advantages/std": 0.3306438624858856,
"advantages/var": 0.10932536379958524,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.286019210245464,
"grad_norm": 0.1430120963726623,
"learning_rate": 2.6795716801494534e-07,
"loss": 0.0,
"num_tokens": 121266307.0,
"reward": 0.8125,
"reward_std": 0.056153833866119385,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"step": 769
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 5.323008924728638e-09,
"advantages/std": 0.43740418553352356,
"advantages/var": 0.1913224215222451,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 3.2902881536819635,
"grad_norm": 0.22800294676904456,
"learning_rate": 2.667738318551005e-07,
"loss": 0.0,
"num_tokens": 121416178.0,
"reward": 0.70703125,
"reward_std": 0.10125912725925446,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 770
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.966747009302438e-09,
"advantages/std": 0.46760493516921997,
"advantages/var": 0.2186543753946104,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.2945570971184632,
"grad_norm": 0.25481125832933416,
"learning_rate": 2.655921629464245e-07,
"loss": 0.0,
"num_tokens": 121565746.0,
"reward": 0.71484375,
"reward_std": 0.1157250627875328,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 771
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.572310092063941e-09,
"advantages/std": 0.4959639608860016,
"advantages/var": 0.2459802504977313,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.2988260405549625,
"grad_norm": 0.28682597279632915,
"learning_rate": 2.644121697362485e-07,
"loss": 0.0,
"num_tokens": 121704270.0,
"reward": 0.8046875,
"reward_std": 0.1250636875629425,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 772
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.6985414192492779e-09,
"advantages/std": 0.5483072400093079,
"advantages/var": 0.30064082944662474,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 3.3030949839914623,
"grad_norm": 0.27403367824418345,
"learning_rate": 2.6323386065992594e-07,
"loss": -0.0,
"num_tokens": 121880668.0,
"reward": 0.640625,
"reward_std": 0.15110857784748077,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 773
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.3073639274279616,
"grad_norm": 0.2682281308298368,
"learning_rate": 2.6205724414077064e-07,
"loss": 0.0,
"num_tokens": 122029949.0,
"reward": 0.73828125,
"reward_std": 0.10376539826393127,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 774
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755680466206912e-09,
"advantages/std": 0.49595409631729126,
"advantages/var": 0.24597046565390102,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.311632870864461,
"grad_norm": 0.2743161534441316,
"learning_rate": 2.608823285899964e-07,
"loss": 0.0,
"num_tokens": 122181927.0,
"reward": 0.7265625,
"reward_std": 0.11481395363807678,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 775
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 6.102934182783377e-09,
"advantages/std": 0.4959578812122345,
"advantages/var": 0.2459742199365289,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 3.3159018143009606,
"grad_norm": 0.2027708527903687,
"learning_rate": 2.597091224066581e-07,
"loss": 0.0,
"num_tokens": 122343058.0,
"reward": 0.72265625,
"reward_std": 0.11993636190891266,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 776
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 5.3230332244416e-09,
"advantages/std": 0.4374021887779236,
"advantages/var": 0.1913206747477183,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 3.32017075773746,
"grad_norm": 0.21503882009414887,
"learning_rate": 2.5853763397759077e-07,
"loss": -0.0,
"num_tokens": 122500411.0,
"reward": 0.70703125,
"reward_std": 0.10061003267765045,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 777
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.9917184873602147e-09,
"advantages/std": 0.46759748458862305,
"advantages/var": 0.21864740759360757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.3244397011739593,
"grad_norm": 0.2098579693727438,
"learning_rate": 2.573678716773496e-07,
"loss": 0.0,
"num_tokens": 122650780.0,
"reward": 0.75,
"reward_std": 0.10942068696022034,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 778
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.7248786584154183e-09,
"advantages/std": 0.4049513339996338,
"advantages/var": 0.16398558290808296,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.328708644610459,
"grad_norm": 0.21299236537132937,
"learning_rate": 2.561998438681507e-07,
"loss": 0.0,
"num_tokens": 122800404.0,
"reward": 0.76953125,
"reward_std": 0.08166831731796265,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 779
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.694570397962185e-09,
"advantages/std": 0.49595728516578674,
"advantages/var": 0.2459736287090175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.3329775880469583,
"grad_norm": 0.3391401964898629,
"learning_rate": 2.5503355889981026e-07,
"loss": 0.0,
"num_tokens": 122946483.0,
"reward": 0.6640625,
"reward_std": 0.11928971856832504,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 780
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.226802052403756e-09,
"advantages/std": 0.5227915048599243,
"advantages/var": 0.27331095755370427,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 3.3372465314834576,
"grad_norm": 0.24102544569783185,
"learning_rate": 2.538690251096862e-07,
"loss": -0.0,
"num_tokens": 123100248.0,
"reward": 0.71484375,
"reward_std": 0.13782215118408203,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 781
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.7248446385931138e-09,
"advantages/std": 0.4049593210220337,
"advantages/var": 0.16399205168262654,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 3.3415154749199574,
"grad_norm": 0.20175673231542038,
"learning_rate": 2.5270625082261753e-07,
"loss": 0.0,
"num_tokens": 123242511.0,
"reward": 0.76171875,
"reward_std": 0.08903107047080994,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 782
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.726138489896817e-09,
"advantages/std": 0.4374004006385803,
"advantages/var": 0.19131911047879058,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.3457844183564567,
"grad_norm": 0.2967863305336702,
"learning_rate": 2.5154524435086535e-07,
"loss": 0.0,
"num_tokens": 123380139.0,
"reward": 0.74609375,
"reward_std": 0.09837214648723602,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 783
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.1125084199436845e-09,
"advantages/std": 0.33064574003219604,
"advantages/var": 0.10932660540143857,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.3500533617929564,
"grad_norm": 0.23352820446542386,
"learning_rate": 2.5038601399405335e-07,
"loss": -0.0,
"num_tokens": 123518056.0,
"reward": 0.80859375,
"reward_std": 0.05786130577325821,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 784
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755677983704659e-09,
"advantages/std": 0.4959544241428375,
"advantages/var": 0.24597079082685358,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.3543223052294557,
"grad_norm": 0.23057733809022715,
"learning_rate": 2.4922856803910784e-07,
"loss": 0.0,
"num_tokens": 123653611.0,
"reward": 0.77734375,
"reward_std": 0.11534436047077179,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 785
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1232070097423174e-09,
"advantages/std": 0.5482994318008423,
"advantages/var": 0.3006322669131265,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.358591248665955,
"grad_norm": 0.28881729463753514,
"learning_rate": 2.480729147601999e-07,
"loss": -0.0,
"num_tokens": 123798879.0,
"reward": 0.72265625,
"reward_std": 0.14256630837917328,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 786
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.983395592005843e-09,
"advantages/std": 0.46760234236717224,
"advantages/var": 0.21865195058726616,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.362860192102455,
"grad_norm": 0.2043066381743843,
"learning_rate": 2.469190624186847e-07,
"loss": -0.0,
"num_tokens": 123962329.0,
"reward": 0.58203125,
"reward_std": 0.114015132188797,
"rewards/drgrpo_math_reward/mean": 0.58203125,
"rewards/drgrpo_math_reward/std": 0.49419113993644714,
"step": 787
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.8778354937898005e-09,
"advantages/std": 0.49595534801483154,
"advantages/var": 0.24597170722451267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.367129135538954,
"grad_norm": 0.2664637882228327,
"learning_rate": 2.4576701926304354e-07,
"loss": 0.0,
"num_tokens": 124110579.0,
"reward": 0.70703125,
"reward_std": 0.11652141809463501,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 788
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.975093007190219e-09,
"advantages/std": 0.46760237216949463,
"advantages/var": 0.21865197845853857,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.3713980789754534,
"grad_norm": 0.238654574305711,
"learning_rate": 2.446167935288244e-07,
"loss": 0.0,
"num_tokens": 124262473.0,
"reward": 0.66015625,
"reward_std": 0.11401514708995819,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 789
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 4.898855043494538e-09,
"advantages/std": 0.5228031873703003,
"advantages/var": 0.2733231727245453,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 3.375667022411953,
"grad_norm": 0.24661389558231522,
"learning_rate": 2.434683934385833e-07,
"loss": 0.0,
"num_tokens": 124424840.0,
"reward": 0.62890625,
"reward_std": 0.14978180825710297,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 790
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.9724474836862365e-09,
"advantages/std": 0.5483072400093079,
"advantages/var": 0.30064082944662474,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 3.3799359658484525,
"grad_norm": 0.27253745908815463,
"learning_rate": 2.423218272018252e-07,
"loss": 0.0,
"num_tokens": 124591001.0,
"reward": 0.6328125,
"reward_std": 0.15110859274864197,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 791
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 6.91155579785241e-09,
"advantages/std": 0.5726816058158875,
"advantages/var": 0.3279642216398635,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.3842049092849518,
"grad_norm": 0.25802938171721934,
"learning_rate": 2.411771030149453e-07,
"loss": 0.0,
"num_tokens": 124751630.0,
"reward": 0.73046875,
"reward_std": 0.15703225135803223,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 792
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.816686215012817e-09,
"advantages/std": 0.49596714973449707,
"advantages/var": 0.24598341361576104,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.3884738527214515,
"grad_norm": 0.27168022769566497,
"learning_rate": 2.400342290611709e-07,
"loss": -0.0,
"num_tokens": 124900592.0,
"reward": 0.7109375,
"reward_std": 0.12953945994377136,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 793
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.962714342249698e-09,
"advantages/std": 0.46759846806526184,
"advantages/var": 0.2186483273369797,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.392742796157951,
"grad_norm": 0.2667009318967389,
"learning_rate": 2.3889321351050284e-07,
"loss": 0.0,
"num_tokens": 125039515.0,
"reward": 0.8671875,
"reward_std": 0.11059774458408356,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"step": 794
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.987538125611118e-09,
"advantages/std": 0.4676036834716797,
"advantages/var": 0.2186532047962828,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.3970117395944506,
"grad_norm": 0.23550138678115673,
"learning_rate": 2.3775406451965645e-07,
"loss": -0.0,
"num_tokens": 125195086.0,
"reward": 0.703125,
"reward_std": 0.11572261154651642,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 795
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.878660979162264e-09,
"advantages/std": 0.5726915001869202,
"advantages/var": 0.3279755543863452,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.40128068303095,
"grad_norm": 0.2735380653658283,
"learning_rate": 2.3661679023200422e-07,
"loss": -0.0,
"num_tokens": 125345445.0,
"reward": 0.69140625,
"reward_std": 0.16951988637447357,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 796
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 8.464761490163722e-09,
"advantages/std": 0.4675998091697693,
"advantages/var": 0.21864958153560465,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 3.405549626467449,
"grad_norm": 0.23813199755629091,
"learning_rate": 2.3548139877751627e-07,
"loss": 0.0,
"num_tokens": 125493833.0,
"reward": 0.73828125,
"reward_std": 0.11230521649122238,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 797
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.1125556422290086e-09,
"advantages/std": 0.3306383490562439,
"advantages/var": 0.10932171786663858,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.409818569903949,
"grad_norm": 0.12153519634202017,
"learning_rate": 2.343478982727039e-07,
"loss": 0.0,
"num_tokens": 125645869.0,
"reward": 0.66015625,
"reward_std": 0.05273643881082535,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 798
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.3696071806564656e-09,
"advantages/std": 0.5483006238937378,
"advantages/var": 0.3006335741622621,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.4140875133404482,
"grad_norm": 0.30323601151678475,
"learning_rate": 2.332162968205598e-07,
"loss": 0.0,
"num_tokens": 125793341.0,
"reward": 0.75,
"reward_std": 0.1442737877368927,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 799
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.4497956537828906e-09,
"advantages/std": 0.40494683384895325,
"advantages/var": 0.16398193824429175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.4183564567769475,
"grad_norm": 0.19049755713492447,
"learning_rate": 2.3208660251050156e-07,
"loss": 0.0,
"num_tokens": 125927288.0,
"reward": 0.828125,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 800
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.4855000219859937e-09,
"advantages/std": 0.46759846806526184,
"advantages/var": 0.2186483273369797,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.4226254002134473,
"grad_norm": 0.27154904727883944,
"learning_rate": 2.309588234183137e-07,
"loss": -0.0,
"num_tokens": 126065545.0,
"reward": 0.7734375,
"reward_std": 0.11059774458408356,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 801
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5726763010025024,
"advantages/var": 0.3279581457299088,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.4268943436499466,
"grad_norm": 0.2711538515302693,
"learning_rate": 2.298329676060884e-07,
"loss": 0.0,
"num_tokens": 126232628.0,
"reward": 0.609375,
"reward_std": 0.15190494060516357,
"rewards/drgrpo_math_reward/mean": 0.609375,
"rewards/drgrpo_math_reward/std": 0.48884621262550354,
"step": 802
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.816749178431964e-09,
"advantages/std": 0.49595606327056885,
"advantages/var": 0.2459724166948405,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.431163287086446,
"grad_norm": 0.19758971646506324,
"learning_rate": 2.2870904312217003e-07,
"loss": -0.0,
"num_tokens": 126384634.0,
"reward": 0.85546875,
"reward_std": 0.11758224666118622,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"step": 803
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.225105741504331e-09,
"advantages/std": 0.4959581792354584,
"advantages/var": 0.24597451555055105,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.4354322305229457,
"grad_norm": 0.2558787584019585,
"learning_rate": 2.2758705800109578e-07,
"loss": -0.0,
"num_tokens": 126536339.0,
"reward": 0.6171875,
"reward_std": 0.12046678364276886,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 804
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 4.225068420259267e-09,
"advantages/std": 0.49596256017684937,
"advantages/var": 0.24597886109717493,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.439701173959445,
"grad_norm": 0.2466944973330916,
"learning_rate": 2.264670202635396e-07,
"loss": -0.0,
"num_tokens": 126686026.0,
"reward": 0.64453125,
"reward_std": 0.1249450072646141,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 805
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 3.726115133006607e-09,
"advantages/std": 0.43740314245224,
"advantages/var": 0.19132150902709455,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.4439701173959447,
"grad_norm": 0.31112164118570995,
"learning_rate": 2.2534893791625404e-07,
"loss": 0.0,
"num_tokens": 126831629.0,
"reward": 0.70703125,
"reward_std": 0.10008206963539124,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 806
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.9917589827329205e-09,
"advantages/std": 0.46758797764778137,
"advantages/var": 0.2186385168407421,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.448239060832444,
"grad_norm": 0.2238639116740361,
"learning_rate": 2.2423281895201336e-07,
"loss": 0.0,
"num_tokens": 126966573.0,
"reward": 0.7734375,
"reward_std": 0.10034801065921783,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 807
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2196840322654347e-09,
"advantages/std": 0.5726826786994934,
"advantages/var": 0.3279654504824272,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.4525080042689433,
"grad_norm": 0.3077283178476076,
"learning_rate": 2.2311867134955636e-07,
"loss": -0.0,
"num_tokens": 127125366.0,
"reward": 0.671875,
"reward_std": 0.15873971581459045,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 808
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.907367650833425e-10,
"advantages/std": 0.5227821469306946,
"advantages/var": 0.27330117314946634,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.456776947705443,
"grad_norm": 0.30284735739903507,
"learning_rate": 2.220065030735288e-07,
"loss": -0.0,
"num_tokens": 127277936.0,
"reward": 0.74609375,
"reward_std": 0.1275724172592163,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 809
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.6985611762532253e-09,
"advantages/std": 0.5483008623123169,
"advantages/var": 0.3006338356124303,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.4610458911419424,
"grad_norm": 0.2639969304678607,
"learning_rate": 2.208963220744276e-07,
"loss": -0.0,
"num_tokens": 127445671.0,
"reward": 0.67578125,
"reward_std": 0.1448042094707489,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 810
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.6721447927622405e-09,
"advantages/std": 0.5227949619293213,
"advantages/var": 0.2733145722186805,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 3.4653148345784417,
"grad_norm": 0.24547420920224383,
"learning_rate": 2.197881362885426e-07,
"loss": 0.0,
"num_tokens": 127598673.0,
"reward": 0.7890625,
"reward_std": 0.14123953878879547,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 811
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.51130496365215e-09,
"advantages/std": 0.49595779180526733,
"advantages/var": 0.2459741312523569,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.4695837780149414,
"grad_norm": 0.24518690215315847,
"learning_rate": 2.1868195363790143e-07,
"loss": 0.0,
"num_tokens": 127760070.0,
"reward": 0.66796875,
"reward_std": 0.11823134124279022,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 812
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.5969151899982774e-09,
"advantages/std": 0.437400758266449,
"advantages/var": 0.19131942333206453,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.4738527214514408,
"grad_norm": 0.2734723366755448,
"learning_rate": 2.175777820302116e-07,
"loss": 0.0,
"num_tokens": 127916569.0,
"reward": 0.6640625,
"reward_std": 0.09890256822109222,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 813
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.5969535994024202e-09,
"advantages/std": 0.4373902380466461,
"advantages/var": 0.19131022033850176,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 3.47812166488794,
"grad_norm": 0.21099785350170763,
"learning_rate": 2.1647562935880405e-07,
"loss": 0.0,
"num_tokens": 128064583.0,
"reward": 0.64453125,
"reward_std": 0.08929946273565292,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 814
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.816693323644983e-09,
"advantages/std": 0.4959658980369568,
"advantages/var": 0.24598217201560502,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 3.48239060832444,
"grad_norm": 0.2667089119009988,
"learning_rate": 2.1537550350257766e-07,
"loss": 0.0,
"num_tokens": 128222698.0,
"reward": 0.68359375,
"reward_std": 0.12783199548721313,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 815
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.235013757668656e-09,
"advantages/std": 0.5227941870689392,
"advantages/var": 0.273313762033073,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 3.486659551760939,
"grad_norm": 0.25729948182946266,
"learning_rate": 2.1427741232594182e-07,
"loss": 0.0,
"num_tokens": 128365820.0,
"reward": 0.796875,
"reward_std": 0.1417675018310547,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 816
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.599693686636838e-09,
"advantages/std": 0.404949814081192,
"advantages/var": 0.16398435192439198,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.609375,
"epoch": 3.490928495197439,
"grad_norm": 0.22707173594327354,
"learning_rate": 2.1318136367876093e-07,
"loss": 0.0,
"num_tokens": 128510234.0,
"reward": 0.6640625,
"reward_std": 0.07996084541082382,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 817
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.408892626470716e-09,
"advantages/std": 0.369665265083313,
"advantages/var": 0.13665240820911606,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.495197438633938,
"grad_norm": 0.22414899065228167,
"learning_rate": 2.120873653962983e-07,
"loss": 0.0,
"num_tokens": 128669693.0,
"reward": 0.6171875,
"reward_std": 0.06549490243196487,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 818
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.0646075153304846e-09,
"advantages/std": 0.43740183115005493,
"advantages/var": 0.19132036189342116,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.4994663820704375,
"grad_norm": 0.2400189816440304,
"learning_rate": 2.109954252991595e-07,
"loss": 0.0,
"num_tokens": 128818258.0,
"reward": 0.734375,
"reward_std": 0.10007961094379425,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 819
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.596904635896381e-09,
"advantages/std": 0.4374036490917206,
"advantages/var": 0.19132195223875303,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.503735325506937,
"grad_norm": 0.24899442368975186,
"learning_rate": 2.0990555119323732e-07,
"loss": -0.0,
"num_tokens": 128965008.0,
"reward": 0.7890625,
"reward_std": 0.10231749713420868,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 820
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 8.187712417192517e-09,
"advantages/std": 0.3696756958961487,
"advantages/var": 0.1366601201363018,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.5080042689434365,
"grad_norm": 0.25818591987177536,
"learning_rate": 2.0881775086965492e-07,
"loss": 0.0,
"num_tokens": 129111286.0,
"reward": 0.76171875,
"reward_std": 0.07456512749195099,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 821
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.2268665404887286e-09,
"advantages/std": 0.5227763652801514,
"advantages/var": 0.27329512809552625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 3.512273212379936,
"grad_norm": 0.3228763186681196,
"learning_rate": 2.0773203210471112e-07,
"loss": -0.0,
"num_tokens": 129254887.0,
"reward": 0.8046875,
"reward_std": 0.12244509905576706,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 822
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.944862008824502e-09,
"advantages/std": 0.5483102798461914,
"advantages/var": 0.30064416298500873,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.5165421558164356,
"grad_norm": 0.31636997771256936,
"learning_rate": 2.0664840265982452e-07,
"loss": 0.0,
"num_tokens": 129417478.0,
"reward": 0.6875,
"reward_std": 0.1539955586194992,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 823
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.246396939612599e-09,
"advantages/std": 0.548301637172699,
"advantages/var": 0.30063468532626203,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.520811099252935,
"grad_norm": 0.2597116754710221,
"learning_rate": 2.0556687028147763e-07,
"loss": 0.0,
"num_tokens": 129566189.0,
"reward": 0.71484375,
"reward_std": 0.14427624642848969,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 824
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.7249168688457406e-09,
"advantages/std": 0.4049423635005951,
"advantages/var": 0.1639783177574481,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.525080042689434,
"grad_norm": 0.19985575215620124,
"learning_rate": 2.0448744270116203e-07,
"loss": 0.0,
"num_tokens": 129723689.0,
"reward": 0.75390625,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 825
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.6985526825316541e-09,
"advantages/std": 0.5483036041259766,
"advantages/var": 0.3006368422975356,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.529348986125934,
"grad_norm": 0.2561307664630002,
"learning_rate": 2.0341012763532239e-07,
"loss": -0.0,
"num_tokens": 129876823.0,
"reward": 0.703125,
"reward_std": 0.14716076850891113,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 826
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.4855557819877025e-09,
"advantages/std": 0.46759098768234253,
"advantages/var": 0.2186413317617486,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.5336179295624333,
"grad_norm": 0.2774758277053324,
"learning_rate": 2.0233493278530244e-07,
"loss": 0.0,
"num_tokens": 130020789.0,
"reward": 0.73828125,
"reward_std": 0.10429336130619049,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 827
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.691728490686944e-09,
"advantages/std": 0.572695791721344,
"advantages/var": 0.327980469855337,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.537886872998933,
"grad_norm": 0.3102557456709466,
"learning_rate": 2.0126186583728855e-07,
"loss": -0.0,
"num_tokens": 130180043.0,
"reward": 0.74609375,
"reward_std": 0.17464473843574524,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 828
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 3.764002631405744e-09,
"advantages/std": 0.618571937084198,
"advantages/var": 0.382631241348097,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.5421558164354323,
"grad_norm": 0.3208915683974791,
"learning_rate": 2.001909344622559e-07,
"loss": 0.0,
"num_tokens": 130344774.0,
"reward": 0.71484375,
"reward_std": 0.19043990969657898,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 829
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.9874931899534334e-09,
"advantages/std": 0.4676107168197632,
"advantages/var": 0.21865978248469276,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.5464247598719316,
"grad_norm": 0.2432981280364349,
"learning_rate": 1.9912214631591312e-07,
"loss": 0.0,
"num_tokens": 130477952.0,
"reward": 0.84765625,
"reward_std": 0.1234995573759079,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"step": 830
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.098457610141317e-09,
"advantages/std": 0.5726791620254517,
"advantages/var": 0.3279614226181735,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.550693703308431,
"grad_norm": 0.2968918749530196,
"learning_rate": 1.980555090386477e-07,
"loss": 0.0,
"num_tokens": 130637160.0,
"reward": 0.78125,
"reward_std": 0.154791921377182,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 831
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.9876278105623057e-09,
"advantages/std": 0.4675896465778351,
"advantages/var": 0.21864007758678472,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.5549626467449307,
"grad_norm": 0.19183500301052084,
"learning_rate": 1.9699103025547143e-07,
"loss": 0.0,
"num_tokens": 130791173.0,
"reward": 0.7421875,
"reward_std": 0.10258589684963226,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 832
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.49596190452575684,
"advantages/var": 0.24597821074081594,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.55923159018143,
"grad_norm": 0.24124343899636105,
"learning_rate": 1.959287175759653e-07,
"loss": 0.0,
"num_tokens": 130923799.0,
"reward": 0.83203125,
"reward_std": 0.1238841712474823,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"step": 833
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.907440772282189e-10,
"advantages/std": 0.5227778553962708,
"advantages/var": 0.2732966860927242,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.5635005336179297,
"grad_norm": 0.24767285169176734,
"learning_rate": 1.9486857859422607e-07,
"loss": -0.0,
"num_tokens": 131070507.0,
"reward": 0.7265625,
"reward_std": 0.12468298524618149,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 834
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.226770571388329e-09,
"advantages/std": 0.5227988958358765,
"advantages/var": 0.2733186854872116,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.567769477054429,
"grad_norm": 0.27492333608946673,
"learning_rate": 1.938106208888114e-07,
"loss": -0.0,
"num_tokens": 131231547.0,
"reward": 0.671875,
"reward_std": 0.14689238369464874,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 835
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.131201492978207e-10,
"advantages/std": 0.5726844668388367,
"advantages/var": 0.3279674985584826,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.5720384204909283,
"grad_norm": 0.3346074472616991,
"learning_rate": 1.927548520226857e-07,
"loss": -0.0,
"num_tokens": 131384872.0,
"reward": 0.71484375,
"reward_std": 0.15991923213005066,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 836
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.920008729382487e-09,
"advantages/std": 0.4373980462551117,
"advantages/var": 0.19131705086778883,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.576307363927428,
"grad_norm": 0.21789478595753164,
"learning_rate": 1.9170127954316645e-07,
"loss": 0.0,
"num_tokens": 131535364.0,
"reward": 0.71875,
"reward_std": 0.09719263017177582,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 837
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 1.953017746559513e-09,
"advantages/std": 0.5960791707038879,
"advantages/var": 0.3553103777470348,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.5805763073639274,
"grad_norm": 0.2817389613292503,
"learning_rate": 1.9064991098186934e-07,
"loss": -0.0,
"num_tokens": 131700679.0,
"reward": 0.640625,
"reward_std": 0.18463735282421112,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 838
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 9.447179218732682e-09,
"advantages/std": 0.36968278884887695,
"advantages/var": 0.13666536437108334,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.584845250800427,
"grad_norm": 0.21340090715046417,
"learning_rate": 1.8960075385465546e-07,
"loss": 0.0,
"num_tokens": 131845157.0,
"reward": 0.6875,
"reward_std": 0.08022041618824005,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 839
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 3.726129604088858e-09,
"advantages/std": 0.4374014437198639,
"advantages/var": 0.19132002296822126,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.5891141942369265,
"grad_norm": 0.22177987637345595,
"learning_rate": 1.8855381566157725e-07,
"loss": -0.0,
"num_tokens": 132005552.0,
"reward": 0.66796875,
"reward_std": 0.09954920411109924,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 840
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 5.749589180951444e-09,
"advantages/std": 0.4049517810344696,
"advantages/var": 0.16398594496298902,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.5933831376734258,
"grad_norm": 0.19025125246218424,
"learning_rate": 1.8750910388682427e-07,
"loss": -0.0,
"num_tokens": 132143786.0,
"reward": 0.828125,
"reward_std": 0.08219873160123825,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 841
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.1938662863450107e-09,
"advantages/std": 0.437395840883255,
"advantages/var": 0.19131512162196973,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.597652081109925,
"grad_norm": 0.19459458021058607,
"learning_rate": 1.8646662599867068e-07,
"loss": -0.0,
"num_tokens": 132293070.0,
"reward": 0.76171875,
"reward_std": 0.09442432969808578,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 842
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.0646268830368173e-09,
"advantages/std": 0.4373938739299774,
"advantages/var": 0.19131340095147298,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 3.601921024546425,
"grad_norm": 0.2134909524177928,
"learning_rate": 1.8542638944942125e-07,
"loss": -0.0,
"num_tokens": 132454528.0,
"reward": 0.66796875,
"reward_std": 0.09377524256706238,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 843
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 7.904298133858483e-09,
"advantages/std": 0.6185803413391113,
"advantages/var": 0.3826416386912115,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.606189967982924,
"grad_norm": 0.30309840432087204,
"learning_rate": 1.8438840167535824e-07,
"loss": 0.0,
"num_tokens": 132620507.0,
"reward": 0.6953125,
"reward_std": 0.20133629441261292,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 844
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.548316478729248,
"advantages/var": 0.3006509608460419,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.610458911419424,
"grad_norm": 0.2991431432555865,
"learning_rate": 1.8335267009668792e-07,
"loss": 0.0,
"num_tokens": 132778683.0,
"reward": 0.69140625,
"reward_std": 0.1618887335062027,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 845
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.3472194717341796e-09,
"advantages/std": 0.4959711730480194,
"advantages/var": 0.24598740449462841,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.614727854855923,
"grad_norm": 0.3224258068457957,
"learning_rate": 1.8231920211748818e-07,
"loss": 0.0,
"num_tokens": 132930609.0,
"reward": 0.7109375,
"reward_std": 0.133487269282341,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 846
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.987660943944554e-09,
"advantages/std": 0.4675844609737396,
"advantages/var": 0.21863522814410263,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.6189967982924225,
"grad_norm": 0.22463884937510695,
"learning_rate": 1.812880051256551e-07,
"loss": 0.0,
"num_tokens": 133084022.0,
"reward": 0.75,
"reward_std": 0.0974610298871994,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 847
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 3.1939374485847554e-09,
"advantages/std": 0.43738609552383423,
"advantages/var": 0.19130659655758464,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.6232657417289222,
"grad_norm": 0.24305824007115434,
"learning_rate": 1.8025908649285032e-07,
"loss": 0.0,
"num_tokens": 133223806.0,
"reward": 0.703125,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 848
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.855440276329792e-09,
"advantages/std": 0.4373944401741028,
"advantages/var": 0.19131389629521678,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.6275346851654215,
"grad_norm": 0.2853846954871858,
"learning_rate": 1.7923245357444843e-07,
"loss": 0.0,
"num_tokens": 133358507.0,
"reward": 0.7578125,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 849
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.388932047240732e-10,
"advantages/std": 0.4959683120250702,
"advantages/var": 0.24598456653299738,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.6318036286019213,
"grad_norm": 0.25126654517040875,
"learning_rate": 1.7820811370948368e-07,
"loss": -0.0,
"num_tokens": 133503879.0,
"reward": 0.75390625,
"reward_std": 0.12954190373420715,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 850
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.2187931845011295e-09,
"advantages/std": 0.5483078360557556,
"advantages/var": 0.3006414830801454,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 3.6360725720384206,
"grad_norm": 0.3011783803267109,
"learning_rate": 1.7718607422059879e-07,
"loss": -0.0,
"num_tokens": 133673186.0,
"reward": 0.625,
"reward_std": 0.15216940641403198,
"rewards/drgrpo_math_reward/mean": 0.625,
"rewards/drgrpo_math_reward/std": 0.4850712716579437,
"step": 851
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 5.7896166871692685e-09,
"advantages/std": 0.522797703742981,
"advantages/var": 0.2733174390389337,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.64034151547492,
"grad_norm": 0.2899610395590994,
"learning_rate": 1.7616634241399176e-07,
"loss": -0.0,
"num_tokens": 133816736.0,
"reward": 0.72265625,
"reward_std": 0.14518490433692932,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 852
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.0327963533730878e-09,
"advantages/std": 0.5726855993270874,
"advantages/var": 0.3279687956766253,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.644610458911419,
"grad_norm": 0.2765547383782989,
"learning_rate": 1.7514892557936307e-07,
"loss": -0.0,
"num_tokens": 133982791.0,
"reward": 0.6640625,
"reward_std": 0.16162671148777008,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 853
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.987660943944554e-09,
"advantages/std": 0.4675844609737396,
"advantages/var": 0.21863522814410263,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.648879402347919,
"grad_norm": 0.2622846551934317,
"learning_rate": 1.741338309898656e-07,
"loss": -0.0,
"num_tokens": 134133320.0,
"reward": 0.6953125,
"reward_std": 0.0974610298871994,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 854
},
{
"advantages/mean": -5.3551048040390015e-09,
"advantages/snr": 8.09801548684253e-09,
"advantages/std": 0.6612860560417175,
"advantages/var": 0.4372992479152096,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.6531483457844183,
"grad_norm": 0.36731981713267514,
"learning_rate": 1.7312106590205012e-07,
"loss": 0.0,
"num_tokens": 134284425.0,
"reward": 0.71484375,
"reward_std": 0.2217308133840561,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 855
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.492732023090665e-09,
"advantages/std": 0.5483056306838989,
"advantages/var": 0.30063906463966816,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.657417289220918,
"grad_norm": 0.2825978000496106,
"learning_rate": 1.7211063755581524e-07,
"loss": -0.0,
"num_tokens": 134442698.0,
"reward": 0.7265625,
"reward_std": 0.15045949816703796,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 856
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.691724344030994e-09,
"advantages/std": 0.5726962089538574,
"advantages/var": 0.3279809477501203,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.6616862326574173,
"grad_norm": 0.2549773013386139,
"learning_rate": 1.7110255317435502e-07,
"loss": 0.0,
"num_tokens": 134621058.0,
"reward": 0.6171875,
"reward_std": 0.17358636856079102,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 857
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.6918558552053295e-09,
"advantages/std": 0.5726829767227173,
"advantages/var": 0.32796579182799235,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 3.6659551760939166,
"grad_norm": 0.36316391513683494,
"learning_rate": 1.700968199641069e-07,
"loss": 0.0,
"num_tokens": 134775005.0,
"reward": 0.74609375,
"reward_std": 0.15927013754844666,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 858
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.408847130130579e-09,
"advantages/std": 0.3696690797805786,
"advantages/var": 0.1366552285458198,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.6702241195304164,
"grad_norm": 0.24753457132012033,
"learning_rate": 1.6909344511470114e-07,
"loss": 0.0,
"num_tokens": 134902976.0,
"reward": 0.78515625,
"reward_std": 0.06944026052951813,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 859
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.492850196788031e-10,
"advantages/std": 0.5482980012893677,
"advantages/var": 0.30063069821791544,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.6744930629669157,
"grad_norm": 0.3315546828284674,
"learning_rate": 1.6809243579890865e-07,
"loss": 0.0,
"num_tokens": 135037019.0,
"reward": 0.68359375,
"reward_std": 0.14032843708992004,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 860
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 8.492722790909057e-10,
"advantages/std": 0.5483062267303467,
"advantages/var": 0.30063971827127034,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.678762006403415,
"grad_norm": 0.286009008207847,
"learning_rate": 1.6709379917259025e-07,
"loss": 0.0,
"num_tokens": 135188376.0,
"reward": 0.66796875,
"reward_std": 0.15110613405704498,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 861
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.9724759189415973e-09,
"advantages/std": 0.5483019948005676,
"advantages/var": 0.3006350775022817,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.6830309498399147,
"grad_norm": 2.4436489261824925,
"learning_rate": 1.6609754237464473e-07,
"loss": 0.0,
"num_tokens": 135341019.0,
"reward": 0.734375,
"reward_std": 0.14651167392730713,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 862
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.1231809285315785e-09,
"advantages/std": 0.5483061671257019,
"advantages/var": 0.30063965290807815,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.687299893276414,
"grad_norm": 0.34011279543517225,
"learning_rate": 1.6510367252695878e-07,
"loss": -0.0,
"num_tokens": 135503240.0,
"reward": 0.55859375,
"reward_std": 0.15110613405704498,
"rewards/drgrpo_math_reward/mean": 0.55859375,
"rewards/drgrpo_math_reward/std": 0.4975275993347168,
"step": 863
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.1292238802003243e-09,
"advantages/std": 0.4374000132083893,
"advantages/var": 0.19131877155469912,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.6915688367129134,
"grad_norm": 0.24761728765685573,
"learning_rate": 1.6411219673435563e-07,
"loss": 0.0,
"num_tokens": 135635341.0,
"reward": 0.796875,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 864
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.11761315668422e-09,
"advantages/std": 0.5227763652801514,
"advantages/var": 0.27329512809552625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.695837780149413,
"grad_norm": 0.36749810582776,
"learning_rate": 1.631231220845437e-07,
"loss": 0.0,
"num_tokens": 135784445.0,
"reward": 0.765625,
"reward_std": 0.12244509160518646,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 865
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.016642767905576e-09,
"advantages/std": 0.5227813720703125,
"advantages/var": 0.2733003629837185,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.7001067235859124,
"grad_norm": 0.24979578320151785,
"learning_rate": 1.621364556480675e-07,
"loss": 0.0,
"num_tokens": 135939903.0,
"reward": 0.84765625,
"reward_std": 0.12810038030147552,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"step": 866
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.258521169625515e-09,
"advantages/std": 0.4373924732208252,
"advantages/var": 0.19131217563023029,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.704375667022412,
"grad_norm": 0.2654350357443024,
"learning_rate": 1.61152204478255e-07,
"loss": -0.0,
"num_tokens": 136095139.0,
"reward": 0.734375,
"reward_std": 0.09206776320934296,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 867
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.694500720521717e-10,
"advantages/std": 0.4959646463394165,
"advantages/var": 0.2459809304185825,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.7086446104589115,
"grad_norm": 0.2536901123402134,
"learning_rate": 1.6017037561116897e-07,
"loss": 0.0,
"num_tokens": 136238980.0,
"reward": 0.7421875,
"reward_std": 0.1261245161294937,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 868
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.1175925403475357e-09,
"advantages/std": 0.5227798223495483,
"advantages/var": 0.2732987426558253,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.712913553895411,
"grad_norm": 0.24456262286722547,
"learning_rate": 1.59190976065556e-07,
"loss": 0.0,
"num_tokens": 136409795.0,
"reward": 0.61328125,
"reward_std": 0.1258624941110611,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 869
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.661552881108902e-09,
"advantages/std": 0.43739622831344604,
"advantages/var": 0.19131546054282822,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.625,
"epoch": 3.71718249733191,
"grad_norm": 0.2562752288229779,
"learning_rate": 1.5821401284279567e-07,
"loss": 0.0,
"num_tokens": 136562123.0,
"reward": 0.6640625,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 870
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.9724516843147123e-09,
"advantages/std": 0.5483064651489258,
"advantages/var": 0.30063997972411016,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.72145144076841,
"grad_norm": 0.29765250685263855,
"learning_rate": 1.572394929268519e-07,
"loss": -0.0,
"num_tokens": 136710574.0,
"reward": 0.671875,
"reward_std": 0.15163654088974,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 871
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.972501446509269e-09,
"advantages/std": 0.5482972860336304,
"advantages/var": 0.3006299138718447,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.725720384204909,
"grad_norm": 0.23036154460610173,
"learning_rate": 1.5626742328422194e-07,
"loss": -0.0,
"num_tokens": 136864200.0,
"reward": 0.72265625,
"reward_std": 0.14085638523101807,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 872
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 5.163816746546357e-09,
"advantages/std": 0.495977520942688,
"advantages/var": 0.2459937012804545,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.729989327641409,
"grad_norm": 0.22976091980157884,
"learning_rate": 1.5529781086388688e-07,
"loss": 0.0,
"num_tokens": 137014888.0,
"reward": 0.77734375,
"reward_std": 0.13873080909252167,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 873
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.46759456396102905,
"advantages/var": 0.2186446762459049,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.65625,
"epoch": 3.734258271077908,
"grad_norm": 0.2551973930148244,
"learning_rate": 1.543306625972623e-07,
"loss": -0.0,
"num_tokens": 137174278.0,
"reward": 0.61328125,
"reward_std": 0.10718034207820892,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 874
},
{
"advantages/mean": -6.51925802230835e-09,
"advantages/snr": 1.0937042648124181e-08,
"advantages/std": 0.5960713624954224,
"advantages/var": 0.3553010691871492,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.7385272145144075,
"grad_norm": 0.2965880638380372,
"learning_rate": 1.5336598539814783e-07,
"loss": -0.0,
"num_tokens": 137310614.0,
"reward": 0.80078125,
"reward_std": 0.17662307620048523,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 875
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.259734082608678e-09,
"advantages/std": 0.3696504533290863,
"advantages/var": 0.136641457646399,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.7427961579509073,
"grad_norm": 0.21159351586605826,
"learning_rate": 1.5240378616267886e-07,
"loss": -0.0,
"num_tokens": 137445559.0,
"reward": 0.74609375,
"reward_std": 0.055242717266082764,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 876
},
{
"advantages/mean": 4.6566128730773926e-09,
"advantages/snr": 8.907189930094833e-09,
"advantages/std": 0.5227925777435303,
"advantages/var": 0.27331207934372515,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 3.7470651013874066,
"grad_norm": 0.28854439169914126,
"learning_rate": 1.5144407176927647e-07,
"loss": -0.0,
"num_tokens": 137600127.0,
"reward": 0.7109375,
"reward_std": 0.14111842215061188,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 877
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 7.041735353792631e-10,
"advantages/std": 0.33064383268356323,
"advantages/var": 0.10932534409167616,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.7513340448239063,
"grad_norm": 0.15326824587402674,
"learning_rate": 1.504868490785987e-07,
"loss": 0.0,
"num_tokens": 137757873.0,
"reward": 0.6484375,
"reward_std": 0.056153833866119385,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 878
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.85536048412357e-09,
"advantages/std": 0.4374004006385803,
"advantages/var": 0.19131911047879058,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.7556029882604056,
"grad_norm": 0.2227162347817429,
"learning_rate": 1.4953212493349076e-07,
"loss": -0.0,
"num_tokens": 137899606.0,
"reward": 0.73828125,
"reward_std": 0.09837213903665543,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 879
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.877790810008545e-09,
"advantages/std": 0.49596714973449707,
"advantages/var": 0.24598341361576104,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.759871931696905,
"grad_norm": 0.27176674002899376,
"learning_rate": 1.4857990615893718e-07,
"loss": -0.0,
"num_tokens": 138061119.0,
"reward": 0.7265625,
"reward_std": 0.12953945994377136,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 880
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.34430603695948e-09,
"advantages/std": 0.5227933526039124,
"advantages/var": 0.27331288952683863,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.7641408751334042,
"grad_norm": 0.26342087753288634,
"learning_rate": 1.4763019956201251e-07,
"loss": 0.0,
"num_tokens": 138207026.0,
"reward": 0.7109375,
"reward_std": 0.14059044420719147,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 881
},
{
"advantages/mean": -5.122274160385132e-09,
"advantages/snr": 8.94427137290863e-09,
"advantages/std": 0.5726876854896545,
"advantages/var": 0.3279711851114975,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.768409818569904,
"grad_norm": 0.3527159478095035,
"learning_rate": 1.4668301193183196e-07,
"loss": 0.0,
"num_tokens": 138350855.0,
"reward": 0.859375,
"reward_std": 0.1633366346359253,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"step": 882
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.40494078397750854,
"advantages/var": 0.16397703852831924,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.7726787620064033,
"grad_norm": 0.22089051678841656,
"learning_rate": 1.4573835003950435e-07,
"loss": -0.0,
"num_tokens": 138499503.0,
"reward": 0.75,
"reward_std": 0.07312604784965515,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 883
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 5.646024076671005e-09,
"advantages/std": 0.6185697317123413,
"advantages/var": 0.3826285129906779,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.776947705442903,
"grad_norm": 0.33332968074776875,
"learning_rate": 1.4479622063808239e-07,
"loss": 0.0,
"num_tokens": 138659563.0,
"reward": 0.6484375,
"reward_std": 0.18819957971572876,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 884
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.1175139874944796e-09,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.7812166488794023,
"grad_norm": 0.2913051287443507,
"learning_rate": 1.438566304625151e-07,
"loss": -0.0,
"num_tokens": 138818903.0,
"reward": 0.66015625,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 885
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.6945190564897304e-09,
"advantages/std": 0.4959627091884613,
"advantages/var": 0.24597900890555824,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 3.7854855923159016,
"grad_norm": 0.30882217106637433,
"learning_rate": 1.429195862295997e-07,
"loss": -0.0,
"num_tokens": 138970507.0,
"reward": 0.62109375,
"reward_std": 0.12335620820522308,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 886
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.164009129592525e-09,
"advantages/std": 0.4959590435028076,
"advantages/var": 0.24597537283221982,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 3.7897545357524014,
"grad_norm": 0.23766551699507737,
"learning_rate": 1.4198509463793273e-07,
"loss": -0.0,
"num_tokens": 139131572.0,
"reward": 0.6328125,
"reward_std": 0.11993881314992905,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 887
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.528024122922025e-10,
"advantages/std": 0.6185703873634338,
"advantages/var": 0.3826293241229486,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.7940234791889007,
"grad_norm": 0.33804980606057317,
"learning_rate": 1.410531623678633e-07,
"loss": -0.0,
"num_tokens": 139303132.0,
"reward": 0.5625,
"reward_std": 0.18767160177230835,
"rewards/drgrpo_math_reward/mean": 0.5625,
"rewards/drgrpo_math_reward/std": 0.49705013632774353,
"step": 888
},
{
"advantages/mean": -6.752088665962219e-09,
"advantages/snr": 1.2915415091048558e-08,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.7982924226254005,
"grad_norm": 0.33630546338820555,
"learning_rate": 1.4012379608144475e-07,
"loss": 0.0,
"num_tokens": 139445873.0,
"reward": 0.76171875,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 889
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.6262340360707772e-09,
"advantages/std": 0.5726866722106934,
"advantages/var": 0.32797002452775814,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.8025613660618998,
"grad_norm": 0.31992935383858606,
"learning_rate": 1.3919700242238712e-07,
"loss": 0.0,
"num_tokens": 139616331.0,
"reward": 0.57421875,
"reward_std": 0.1633341759443283,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 890
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.246379398265418e-09,
"advantages/std": 0.5483039021492004,
"advantages/var": 0.30063716911203997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.806830309498399,
"grad_norm": 0.3029778902195819,
"learning_rate": 1.3827278801600978e-07,
"loss": 0.0,
"num_tokens": 139762906.0,
"reward": 0.78515625,
"reward_std": 0.14769119024276733,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 891
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.3360704161248273e-09,
"advantages/std": 0.5227957367897034,
"advantages/var": 0.2733153824054888,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.8110992529348984,
"grad_norm": 0.24190131077280838,
"learning_rate": 1.373511594691934e-07,
"loss": -0.0,
"num_tokens": 139923060.0,
"reward": 0.7265625,
"reward_std": 0.14400538802146912,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 892
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4938013376916182e-09,
"advantages/std": 0.46759358048439026,
"advantages/var": 0.21864375651021195,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.815368196371398,
"grad_norm": 0.213665408572706,
"learning_rate": 1.3643212337033393e-07,
"loss": -0.0,
"num_tokens": 140072958.0,
"reward": 0.75390625,
"reward_std": 0.1060032844543457,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 893
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.2727493096958816e-09,
"advantages/std": 0.6402799487113953,
"advantages/var": 0.40995841272186695,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.8196371398078974,
"grad_norm": 0.29694736667971106,
"learning_rate": 1.3551568628929432e-07,
"loss": 0.0,
"num_tokens": 140234033.0,
"reward": 0.67578125,
"reward_std": 0.2014860063791275,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 894
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.46761050820350647,
"advantages/var": 0.2186595873823416,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.823906083244397,
"grad_norm": 0.19275105427008563,
"learning_rate": 1.346018547773582e-07,
"loss": 0.0,
"num_tokens": 140380833.0,
"reward": 0.7265625,
"reward_std": 0.12138035148382187,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 895
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.659128645842666e-09,
"advantages/std": 0.5726706981658936,
"advantages/var": 0.32795172853781196,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 3.8281750266808965,
"grad_norm": 0.26645263914606415,
"learning_rate": 1.3369063536718344e-07,
"loss": 0.0,
"num_tokens": 140522260.0,
"reward": 0.796875,
"reward_std": 0.14454218745231628,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 896
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.991712901920792e-09,
"advantages/std": 0.4675987958908081,
"advantages/var": 0.21864863391853362,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.832443970117396,
"grad_norm": 0.2393996622425807,
"learning_rate": 1.3278203457275399e-07,
"loss": -0.0,
"num_tokens": 140676901.0,
"reward": 0.77734375,
"reward_std": 0.11112815886735916,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 897
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.5629629032543266e-09,
"advantages/std": 0.5227798223495483,
"advantages/var": 0.2732987426558253,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.8367129135538955,
"grad_norm": 0.24755508461462025,
"learning_rate": 1.3187605888933505e-07,
"loss": 0.0,
"num_tokens": 140835149.0,
"reward": 0.70703125,
"reward_std": 0.1258624941110611,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 898
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.453673669766467e-09,
"advantages/std": 0.5227833390235901,
"advantages/var": 0.27330241956065393,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.840981856990395,
"grad_norm": 0.2601741221037382,
"learning_rate": 1.3097271479342525e-07,
"loss": 0.0,
"num_tokens": 140996194.0,
"reward": 0.6328125,
"reward_std": 0.12927988171577454,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 899
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 5.323127886632988e-09,
"advantages/std": 0.4373944103717804,
"advantages/var": 0.19131387022447743,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.8452508004268946,
"grad_norm": 0.23103748082036624,
"learning_rate": 1.3007200874271124e-07,
"loss": 0.0,
"num_tokens": 141132943.0,
"reward": 0.7890625,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 900
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.633436069767149e-09,
"advantages/std": 0.4959615468978882,
"advantages/var": 0.24597785600134614,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.849519743863394,
"grad_norm": 0.23474980568006226,
"learning_rate": 1.291739471760212e-07,
"loss": 0.0,
"num_tokens": 141282228.0,
"reward": 0.7734375,
"reward_std": 0.1233537495136261,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 901
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.2268269331908505e-09,
"advantages/std": 0.5227856636047363,
"advantages/var": 0.27330485007064453,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 3.853788687299893,
"grad_norm": 0.25644152549883725,
"learning_rate": 1.282785365132788e-07,
"loss": -0.0,
"num_tokens": 141449187.0,
"reward": 0.5390625,
"reward_std": 0.13269484043121338,
"rewards/drgrpo_math_reward/mean": 0.5390625,
"rewards/drgrpo_math_reward/std": 0.4994482398033142,
"step": 902
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5960760116577148,
"advantages/var": 0.3553066116737682,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.8580576307363925,
"grad_norm": 0.3059233842690013,
"learning_rate": 1.273857831554575e-07,
"loss": -0.0,
"num_tokens": 141613248.0,
"reward": 0.72265625,
"reward_std": 0.1828087568283081,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 903
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4083371836861376e-09,
"advantages/std": 0.4959692358970642,
"advantages/var": 0.24598548295631772,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.8623265741728923,
"grad_norm": 0.26417244640023585,
"learning_rate": 1.2649569348453415e-07,
"loss": 0.0,
"num_tokens": 141786131.0,
"reward": 0.63671875,
"reward_std": 0.13071897625923157,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 904
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.408326774808376e-09,
"advantages/std": 0.4959729015827179,
"advantages/var": 0.24598911910438037,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.8665955176093916,
"grad_norm": 0.25801166081421384,
"learning_rate": 1.256082738634444e-07,
"loss": 0.0,
"num_tokens": 141934869.0,
"reward": 0.6875,
"reward_std": 0.134136363863945,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 905
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.117604270298766e-09,
"advantages/std": 0.5227778553962708,
"advantages/var": 0.2732966860927242,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 3.8708644610458913,
"grad_norm": 0.28898482123519487,
"learning_rate": 1.2472353063603623e-07,
"loss": 0.0,
"num_tokens": 142073269.0,
"reward": 0.8515625,
"reward_std": 0.12468297779560089,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"step": 906
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.987508612632739e-09,
"advantages/std": 0.4676083028316498,
"advantages/var": 0.2186575248770959,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 3.8751334044823906,
"grad_norm": 0.15786910018254705,
"learning_rate": 1.2384147012702518e-07,
"loss": -0.0,
"num_tokens": 142230015.0,
"reward": 0.75390625,
"reward_std": 0.12020084261894226,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 907
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.246344777610733e-10,
"advantages/std": 0.5483083724975586,
"advantages/var": 0.30064207135092147,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.87940234791889,
"grad_norm": 0.29409113629686395,
"learning_rate": 1.229620986419494e-07,
"loss": 0.0,
"num_tokens": 142384322.0,
"reward": 0.70703125,
"reward_std": 0.1528160572052002,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 908
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.024678068029527e-09,
"advantages/std": 0.4049552381038666,
"advantages/var": 0.16398874486775927,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.8836712913553897,
"grad_norm": 0.17047396269876772,
"learning_rate": 1.2208542246712344e-07,
"loss": 0.0,
"num_tokens": 142543955.0,
"reward": 0.67578125,
"reward_std": 0.0861440896987915,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 909
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.6721368717488085e-09,
"advantages/std": 0.5227965116500854,
"advantages/var": 0.27331619259349793,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.887940234791889,
"grad_norm": 0.3010578150999706,
"learning_rate": 1.2121144786959464e-07,
"loss": 0.0,
"num_tokens": 142716353.0,
"reward": 0.6953125,
"reward_std": 0.1434774398803711,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 910
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.812095983572427e-10,
"advantages/std": 0.5960772633552551,
"advantages/var": 0.3553081038890902,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 3.8922091782283887,
"grad_norm": 0.28912195447402445,
"learning_rate": 1.2034018109709716e-07,
"loss": -0.0,
"num_tokens": 142861296.0,
"reward": 0.78125,
"reward_std": 0.18463245034217834,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 911
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.694555446758157e-10,
"advantages/std": 0.4959588646888733,
"advantages/var": 0.24597519546347613,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.896478121664888,
"grad_norm": 0.2527869648597765,
"learning_rate": 1.1947162837800838e-07,
"loss": 0.0,
"num_tokens": 143018752.0,
"reward": 0.6171875,
"reward_std": 0.12152761220932007,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"step": 912
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.991751746761795e-09,
"advantages/std": 0.46758967638015747,
"advantages/var": 0.2186401054573004,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.9007470651013874,
"grad_norm": 0.2158798647469881,
"learning_rate": 1.1860579592130365e-07,
"loss": 0.0,
"num_tokens": 143148036.0,
"reward": 0.8359375,
"reward_std": 0.10258589684963226,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"step": 913
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 1.0615920798986244e-08,
"advantages/std": 0.548305332660675,
"advantages/var": 0.30063873782413353,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 3.9050160085378867,
"grad_norm": 0.2755074303952438,
"learning_rate": 1.1774268991651209e-07,
"loss": 0.0,
"num_tokens": 143309921.0,
"reward": 0.81640625,
"reward_std": 0.14992906153202057,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 914
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.8181813666449564e-09,
"advantages/std": 0.6402844190597534,
"advantages/var": 0.4099641372906859,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.9092849519743864,
"grad_norm": 0.3347062868224139,
"learning_rate": 1.168823165336727e-07,
"loss": -0.0,
"num_tokens": 143490099.0,
"reward": 0.55859375,
"reward_std": 0.20608291029930115,
"rewards/drgrpo_math_reward/mean": 0.55859375,
"rewards/drgrpo_math_reward/std": 0.4975275993347168,
"step": 915
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4083547014131486e-09,
"advantages/std": 0.49596306681632996,
"advantages/var": 0.24597936364585937,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.9135538954108857,
"grad_norm": 0.26389252496546856,
"learning_rate": 1.1602468192328934e-07,
"loss": -0.0,
"num_tokens": 143624497.0,
"reward": 0.796875,
"reward_std": 0.12388662248849869,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 916
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.0646119401001621e-09,
"advantages/std": 0.4374000132083893,
"advantages/var": 0.19131877155469912,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 3.9178228388473855,
"grad_norm": 0.1934175385847749,
"learning_rate": 1.1516979221628803e-07,
"loss": 0.0,
"num_tokens": 143787782.0,
"reward": 0.75,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 917
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5726915597915649,
"advantages/var": 0.3279756226564956,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.9220917822838848,
"grad_norm": 0.2799402291849327,
"learning_rate": 1.1431765352397166e-07,
"loss": 0.0,
"num_tokens": 143928082.0,
"reward": 0.80859375,
"reward_std": 0.16951988637447357,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 918
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.453658436377939e-09,
"advantages/std": 0.5227851271629333,
"advantages/var": 0.2733042891827644,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 3.926360725720384,
"grad_norm": 0.2900591127626478,
"learning_rate": 1.1346827193797797e-07,
"loss": -0.0,
"num_tokens": 144068591.0,
"reward": 0.76171875,
"reward_std": 0.13204818964004517,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 919
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1231809285315785e-09,
"advantages/std": 0.5483061671257019,
"advantages/var": 0.30063965290807815,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.930629669156884,
"grad_norm": 0.26515987521507567,
"learning_rate": 1.1262165353023472e-07,
"loss": 0.0,
"num_tokens": 144226666.0,
"reward": 0.71484375,
"reward_std": 0.1511061191558838,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 920
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 5.163944896884701e-09,
"advantages/std": 0.49596521258354187,
"advantages/var": 0.24598149209303788,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 3.934898612593383,
"grad_norm": 0.23332221023622182,
"learning_rate": 1.117778043529164e-07,
"loss": -0.0,
"num_tokens": 144374744.0,
"reward": 0.73828125,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 921
},
{
"advantages/mean": -6.05359673500061e-09,
"advantages/snr": 9.15431138117741e-09,
"advantages/std": 0.6612836718559265,
"advantages/var": 0.4372960946632567,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.9391675560298824,
"grad_norm": 0.3211763858336168,
"learning_rate": 1.1093673043840179e-07,
"loss": 0.0,
"num_tokens": 144545000.0,
"reward": 0.57421875,
"reward_std": 0.2205488383769989,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 922
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.225008758413878e-09,
"advantages/std": 0.4959695637226105,
"advantages/var": 0.24598580813919657,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.943436499466382,
"grad_norm": 0.32812558235268086,
"learning_rate": 1.1009843779922978e-07,
"loss": 0.0,
"num_tokens": 144688710.0,
"reward": 0.6875,
"reward_std": 0.13124938309192657,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 923
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.4855557819877025e-09,
"advantages/std": 0.46759098768234253,
"advantages/var": 0.2186413317617486,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 3.9477054429028815,
"grad_norm": 0.24068333353653437,
"learning_rate": 1.0926293242805735e-07,
"loss": 0.0,
"num_tokens": 144847224.0,
"reward": 0.80859375,
"reward_std": 0.10429336875677109,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 924
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 3.2727621056868287e-09,
"advantages/std": 0.6402774453163147,
"advantages/var": 0.40995520698078636,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 3.951974386339381,
"grad_norm": 0.35789550154522065,
"learning_rate": 1.0843022029761595e-07,
"loss": 0.0,
"num_tokens": 145004617.0,
"reward": 0.66015625,
"reward_std": 0.19701021909713745,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 925
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 5.323088352971216e-10,
"advantages/std": 0.43739765882492065,
"advantages/var": 0.1913167119455217,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 3.9562433297758806,
"grad_norm": 0.22273311099582965,
"learning_rate": 1.076003073606695e-07,
"loss": 0.0,
"num_tokens": 145158662.0,
"reward": 0.76953125,
"reward_std": 0.09666222333908081,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 926
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.694597479630824e-09,
"advantages/std": 0.4959544241428375,
"advantages/var": 0.24597079082685358,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.96051227321238,
"grad_norm": 0.2734087805574896,
"learning_rate": 1.0677319954997127e-07,
"loss": 0.0,
"num_tokens": 145303572.0,
"reward": 0.77734375,
"reward_std": 0.11534436047077179,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 927
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.2524220349498645e-09,
"advantages/std": 0.5726947784423828,
"advantages/var": 0.32797930925516994,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 3.9647812166488796,
"grad_norm": 0.28946305611975287,
"learning_rate": 1.0594890277822149e-07,
"loss": 0.0,
"num_tokens": 145479142.0,
"reward": 0.5546875,
"reward_std": 0.17464229464530945,
"rewards/drgrpo_math_reward/mean": 0.5546875,
"rewards/drgrpo_math_reward/std": 0.49797385931015015,
"step": 928
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 7.636414345071518e-09,
"advantages/std": 0.64028000831604,
"advantages/var": 0.4099584890491883,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 3.969050160085379,
"grad_norm": 0.37535894555351246,
"learning_rate": 1.0512742293802556e-07,
"loss": 0.0,
"num_tokens": 145634602.0,
"reward": 0.80859375,
"reward_std": 0.19819219410419464,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 929
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.898892464424674e-09,
"advantages/std": 0.5227991938591003,
"advantages/var": 0.2733189970997252,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.9733191035218782,
"grad_norm": 0.24714138330863633,
"learning_rate": 1.0430876590185162e-07,
"loss": 0.0,
"num_tokens": 145789751.0,
"reward": 0.72265625,
"reward_std": 0.14742279052734375,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 930
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.344368797067637e-09,
"advantages/std": 0.5227872133255005,
"advantages/var": 0.27330647041664236,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.9775880469583775,
"grad_norm": 0.34603444556490903,
"learning_rate": 1.034929375219884e-07,
"loss": 0.0,
"num_tokens": 145929123.0,
"reward": 0.8515625,
"reward_std": 0.1349327117204666,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"step": 931
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.49293144311269e-09,
"advantages/std": 0.5482927560806274,
"advantages/var": 0.3006249463704904,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 3.9818569903948773,
"grad_norm": 0.281880223070927,
"learning_rate": 1.0267994363050386e-07,
"loss": 0.0,
"num_tokens": 146071625.0,
"reward": 0.83984375,
"reward_std": 0.1357315182685852,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"step": 932
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.547752029076837e-09,
"advantages/std": 0.5483201742172241,
"advantages/var": 0.300655013453607,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 3.9861259338313766,
"grad_norm": 0.24291507723078548,
"learning_rate": 1.0186979003920271e-07,
"loss": 0.0,
"num_tokens": 146228741.0,
"reward": 0.69140625,
"reward_std": 0.16754156351089478,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 933
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 4.8989756861545536e-09,
"advantages/std": 0.5227903127670288,
"advantages/var": 0.2733097111230478,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 3.9903948772678763,
"grad_norm": 0.24655260154008715,
"learning_rate": 1.0106248253958604e-07,
"loss": 0.0,
"num_tokens": 146376853.0,
"reward": 0.7890625,
"reward_std": 0.1361146867275238,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 934
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.439360447893462e-09,
"advantages/std": 0.5726844668388367,
"advantages/var": 0.3279674985584826,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.9946638207043756,
"grad_norm": 0.30695823633009073,
"learning_rate": 1.0025802690280849e-07,
"loss": 0.0,
"num_tokens": 146515186.0,
"reward": 0.78515625,
"reward_std": 0.15991923213005066,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 935
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 7.474437331114142e-09,
"advantages/std": 0.40495333075523376,
"advantages/var": 0.16398720008975776,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 3.998932764140875,
"grad_norm": 0.19395846323411928,
"learning_rate": 9.94564288796384e-08,
"loss": 0.0,
"num_tokens": 146666516.0,
"reward": 0.79296875,
"reward_std": 0.08390620350837708,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 936
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.975255240285385e-09,
"advantages/std": 0.46758967638015747,
"advantages/var": 0.2186401054573004,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.004268943436499,
"grad_norm": 0.2328061525623079,
"learning_rate": 9.865769420041559e-08,
"loss": 0.0,
"num_tokens": 146800260.0,
"reward": 0.7890625,
"reward_std": 0.10258589684963226,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 937
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5624185717793762e-09,
"advantages/std": 0.5960775017738342,
"advantages/var": 0.35530838812093535,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.008537886872999,
"grad_norm": 0.3494739089978992,
"learning_rate": 9.786182857501118e-08,
"loss": -0.0,
"num_tokens": 146955997.0,
"reward": 0.57421875,
"reward_std": 0.1834578514099121,
"rewards/drgrpo_math_reward/mean": 0.57421875,
"rewards/drgrpo_math_reward/std": 0.49542948603630066,
"step": 938
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.90732195053758e-10,
"advantages/std": 0.5227848291397095,
"advantages/var": 0.2733039775786352,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.012806830309499,
"grad_norm": 0.2601092761417996,
"learning_rate": 9.706883769278639e-08,
"loss": 0.0,
"num_tokens": 147109230.0,
"reward": 0.734375,
"reward_std": 0.13151776790618896,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 939
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.3971312155877077e-09,
"advantages/std": 0.5482994318008423,
"advantages/var": 0.3006322669131265,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.017075773745998,
"grad_norm": 0.3054328373484479,
"learning_rate": 9.627872722255154e-08,
"loss": 0.0,
"num_tokens": 147255304.0,
"reward": 0.81640625,
"reward_std": 0.14256632328033447,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 940
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.877782911581517e-09,
"advantages/std": 0.4959692358970642,
"advantages/var": 0.24598548295631772,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.021344717182497,
"grad_norm": 0.26965248470222386,
"learning_rate": 9.549150281252632e-08,
"loss": 0.0,
"num_tokens": 147411097.0,
"reward": 0.69921875,
"reward_std": 0.13071897625923157,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 941
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 3.1175744124147052e-09,
"advantages/std": 0.5227828621864319,
"advantages/var": 0.27330192099583783,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 4.025613660618997,
"grad_norm": 0.21569709619664004,
"learning_rate": 9.470717009029888e-08,
"loss": -0.0,
"num_tokens": 147571275.0,
"reward": 0.66796875,
"reward_std": 0.13033825159072876,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 942
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.2738897237632814e-09,
"advantages/std": 0.5483142733573914,
"advantages/var": 0.3006485423674441,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.029882604055496,
"grad_norm": 0.30706035259478337,
"learning_rate": 9.39257346627857e-08,
"loss": 0.0,
"num_tokens": 147717314.0,
"reward": 0.71484375,
"reward_std": 0.16017881035804749,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 943
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.344317613955936e-09,
"advantages/std": 0.5227922201156616,
"advantages/var": 0.2733117054134624,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.034151547491995,
"grad_norm": 0.3296832962360881,
"learning_rate": 9.314720211619165e-08,
"loss": -0.0,
"num_tokens": 147872218.0,
"reward": 0.72265625,
"reward_std": 0.14058800041675568,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 944
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.7814613434419682e-09,
"advantages/std": 0.5227857232093811,
"advantages/var": 0.27330491239155563,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.0384204909284955,
"grad_norm": 0.3017122128114032,
"learning_rate": 9.237157801596957e-08,
"loss": 0.0,
"num_tokens": 148004908.0,
"reward": 0.765625,
"reward_std": 0.13269484043121338,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 945
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.5629543724332273e-09,
"advantages/std": 0.5227810740470886,
"advantages/var": 0.27330005138182756,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.042689434364995,
"grad_norm": 0.3017288715106481,
"learning_rate": 9.159886790678123e-08,
"loss": 0.0,
"num_tokens": 148171117.0,
"reward": 0.640625,
"reward_std": 0.12756995856761932,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 946
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.131207417015343e-10,
"advantages/std": 0.5726840496063232,
"advantages/var": 0.3279670206734977,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.046958377801494,
"grad_norm": 0.24636777783457547,
"learning_rate": 9.082907731245731e-08,
"loss": -0.0,
"num_tokens": 148333037.0,
"reward": 0.7109375,
"reward_std": 0.16097763180732727,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 947
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.492724637343774e-09,
"advantages/std": 0.5483061075210571,
"advantages/var": 0.30063958754489306,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.0512273212379935,
"grad_norm": 0.2643422583957601,
"learning_rate": 9.00622117359574e-08,
"loss": 0.0,
"num_tokens": 148491408.0,
"reward": 0.68359375,
"reward_std": 0.14940111339092255,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 948
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.408867390103578e-09,
"advantages/std": 0.3696673810482025,
"advantages/var": 0.13665397261103696,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 4.055496264674493,
"grad_norm": 0.24213778856531787,
"learning_rate": 8.929827665933209e-08,
"loss": -0.0,
"num_tokens": 148643434.0,
"reward": 0.6640625,
"reward_std": 0.0677327960729599,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 949
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.453571100295136e-10,
"advantages/std": 0.5227953791618347,
"advantages/var": 0.2733150084729665,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.059765208110993,
"grad_norm": 0.28065996813370997,
"learning_rate": 8.85372775436819e-08,
"loss": 0.0,
"num_tokens": 148796153.0,
"reward": 0.76171875,
"reward_std": 0.1434749811887741,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 950
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.966894768009674e-09,
"advantages/std": 0.46759626269340515,
"advantages/var": 0.21864626488483996,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.064034151547492,
"grad_norm": 0.29909557937352554,
"learning_rate": 8.777921982911996e-08,
"loss": 0.0,
"num_tokens": 148926152.0,
"reward": 0.79296875,
"reward_std": 0.10941822826862335,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 951
},
{
"advantages/mean": 3.725290298461914e-09,
"advantages/snr": 9.19955663384395e-09,
"advantages/std": 0.4049423635005951,
"advantages/var": 0.1639783177574481,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 4.068303094983992,
"grad_norm": 0.22336800428426953,
"learning_rate": 8.702410893473173e-08,
"loss": -0.0,
"num_tokens": 149081237.0,
"reward": 0.66015625,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 952
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.987508612632739e-09,
"advantages/std": 0.4676083028316498,
"advantages/var": 0.2186575248770959,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.072572038420491,
"grad_norm": 0.26231833502338076,
"learning_rate": 8.627195025853734e-08,
"loss": -0.0,
"num_tokens": 149230995.0,
"reward": 0.75390625,
"reward_std": 0.12020084261894226,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 953
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.25859719291336e-09,
"advantages/std": 0.4373846650123596,
"advantages/var": 0.19130534518797404,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.703125,
"epoch": 4.07684098185699,
"grad_norm": 0.24172333880642313,
"learning_rate": 8.552274917745244e-08,
"loss": 0.0,
"num_tokens": 149399147.0,
"reward": 0.60546875,
"reward_std": 0.08417458832263947,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 954
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5624051357954486e-09,
"advantages/std": 0.5960826277732849,
"advantages/var": 0.35531449913310453,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.0811099252934895,
"grad_norm": 0.3066328634254312,
"learning_rate": 8.477651104724992e-08,
"loss": -0.0,
"num_tokens": 149565867.0,
"reward": 0.6484375,
"reward_std": 0.1902901977300644,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 955
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.5056154153628903e-09,
"advantages/std": 0.6185660362243652,
"advantages/var": 0.3826239411703227,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.08537886872999,
"grad_norm": 0.31224449036267354,
"learning_rate": 8.403324120252159e-08,
"loss": -0.0,
"num_tokens": 149723469.0,
"reward": 0.67578125,
"reward_std": 0.1836051195859909,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 956
},
{
"advantages/mean": -4.889443516731262e-09,
"advantages/snr": 1.0456236160976e-08,
"advantages/std": 0.46761026978492737,
"advantages/var": 0.21865936440833256,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.089647812166489,
"grad_norm": 0.26791288036125843,
"learning_rate": 8.32929449566398e-08,
"loss": 0.0,
"num_tokens": 149865420.0,
"reward": 0.69921875,
"reward_std": 0.1225549504160881,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 957
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.69179720757951e-09,
"advantages/std": 0.57268887758255,
"advantages/var": 0.327972550506761,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.093916755602988,
"grad_norm": 0.26559537702277175,
"learning_rate": 8.255562760172003e-08,
"loss": 0.0,
"num_tokens": 150021469.0,
"reward": 0.77734375,
"reward_std": 0.16674911975860596,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 958
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 5.077871021256398e-09,
"advantages/std": 0.596076250076294,
"advantages/var": 0.3553068959050165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.098185699039488,
"grad_norm": 0.2941006947409124,
"learning_rate": 8.182129440858259e-08,
"loss": -0.0,
"num_tokens": 150179899.0,
"reward": 0.77734375,
"reward_std": 0.18292498588562012,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 959
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.8895334850316917e-09,
"advantages/std": 0.36966368556022644,
"advantages/var": 0.13665124042196997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.102454642475987,
"grad_norm": 0.2398030583187631,
"learning_rate": 8.10899506267148e-08,
"loss": 0.0,
"num_tokens": 150311781.0,
"reward": 0.83984375,
"reward_std": 0.06549245119094849,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"step": 960
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.5628731285651194e-09,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 4.106723585912487,
"grad_norm": 0.2716576868483362,
"learning_rate": 8.036160148423449e-08,
"loss": 0.0,
"num_tokens": 150470368.0,
"reward": 0.69921875,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 961
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.2597100125149844e-09,
"advantages/std": 0.3696575164794922,
"advantages/var": 0.13664667948978604,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 4.110992529348986,
"grad_norm": 0.18986091291824578,
"learning_rate": 7.963625218785097e-08,
"loss": 0.0,
"num_tokens": 150612347.0,
"reward": 0.7109375,
"reward_std": 0.06089799851179123,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 962
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 1.881998232812962e-09,
"advantages/std": 0.6185729503631592,
"advantages/var": 0.3826324949209834,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.115261472785486,
"grad_norm": 0.3400034315594602,
"learning_rate": 7.891390792282926e-08,
"loss": 0.0,
"num_tokens": 150759230.0,
"reward": 0.7265625,
"reward_std": 0.1921473890542984,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 963
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.8777980314856883e-09,
"advantages/std": 0.49596524238586426,
"advantages/var": 0.24598152165486908,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.119530416221985,
"grad_norm": 0.33934023630760835,
"learning_rate": 7.819457385295252e-08,
"loss": 0.0,
"num_tokens": 150910087.0,
"reward": 0.62109375,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.62109375,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"step": 964
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.3278015020627767e-09,
"advantages/std": 0.701401948928833,
"advantages/var": 0.49196469396116527,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.123799359658484,
"grad_norm": 0.3693397260938956,
"learning_rate": 7.747825512048461e-08,
"loss": 0.0,
"num_tokens": 151068328.0,
"reward": 0.71484375,
"reward_std": 0.25619441270828247,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 965
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.633321316168079e-09,
"advantages/std": 0.4959716498851776,
"advantages/var": 0.2459878774898252,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.128068303094984,
"grad_norm": 0.3273626422965575,
"learning_rate": 7.676495684613432e-08,
"loss": 0.0,
"num_tokens": 151198824.0,
"reward": 0.83203125,
"reward_std": 0.13242888450622559,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"step": 966
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 2.1292784298048236e-09,
"advantages/std": 0.4373888075351715,
"advantages/var": 0.1913089689570393,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 4.132337246531484,
"grad_norm": 0.2688237297891008,
"learning_rate": 7.6054684129018e-08,
"loss": -0.0,
"num_tokens": 151343984.0,
"reward": 0.75,
"reward_std": 0.0875919908285141,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 967
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.726110817091556e-09,
"advantages/std": 0.4374036490917206,
"advantages/var": 0.19132195223875303,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.136606189967983,
"grad_norm": 0.25840927545905207,
"learning_rate": 7.534744204662347e-08,
"loss": 0.0,
"num_tokens": 151484365.0,
"reward": 0.765625,
"reward_std": 0.10231749713420868,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 968
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 6.2985861164521495e-09,
"advantages/std": 0.36965540051460266,
"advantages/var": 0.1366451151296113,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.140875133404482,
"grad_norm": 0.21189480775325778,
"learning_rate": 7.464323565477371e-08,
"loss": -0.0,
"num_tokens": 151613008.0,
"reward": 0.9140625,
"reward_std": 0.0586601160466671,
"rewards/drgrpo_math_reward/mean": 0.9140625,
"rewards/drgrpo_math_reward/std": 0.28082075715065,
"step": 969
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.2463817063291353e-10,
"advantages/std": 0.5483036041259766,
"advantages/var": 0.3006368422975356,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.145144076840982,
"grad_norm": 0.3253137810960524,
"learning_rate": 7.394206998759011e-08,
"loss": 0.0,
"num_tokens": 151759662.0,
"reward": 0.7265625,
"reward_std": 0.14716076850891113,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 970
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.008279339781276e-09,
"advantages/std": 0.5227868556976318,
"advantages/var": 0.27330609649021653,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.149413020277481,
"grad_norm": 0.2837713526169662,
"learning_rate": 7.324395005745771e-08,
"loss": 0.0,
"num_tokens": 151906750.0,
"reward": 0.73046875,
"reward_std": 0.13269728422164917,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 971
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.1939043697180337e-09,
"advantages/std": 0.43739062547683716,
"advantages/var": 0.19131055925501883,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.15368196371398,
"grad_norm": 0.22668461022016348,
"learning_rate": 7.254888085498812e-08,
"loss": -0.0,
"num_tokens": 152053010.0,
"reward": 0.7890625,
"reward_std": 0.08982987701892853,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 972
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.749723319311711e-10,
"advantages/std": 0.4049423336982727,
"advantages/var": 0.16397829362100325,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.1579509071504805,
"grad_norm": 0.22761884549822234,
"learning_rate": 7.185686734898477e-08,
"loss": -0.0,
"num_tokens": 152193920.0,
"reward": 0.68359375,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 973
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 4.0082519201446085e-09,
"advantages/std": 0.5227904319763184,
"advantages/var": 0.27330983576598555,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.16221985058698,
"grad_norm": 0.34005738823016896,
"learning_rate": 7.116791448640663e-08,
"loss": -0.0,
"num_tokens": 152334407.0,
"reward": 0.703125,
"reward_std": 0.13781970739364624,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 974
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.43738046288490295,
"advantages/var": 0.19130166931341197,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.166488794023479,
"grad_norm": 0.25319669034295045,
"learning_rate": 7.048202719233343e-08,
"loss": -0.0,
"num_tokens": 152475918.0,
"reward": 0.7734375,
"reward_std": 0.08075720071792603,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 975
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.453593949520833e-10,
"advantages/std": 0.5227926969528198,
"advantages/var": 0.2733122039872029,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.1707577374599785,
"grad_norm": 0.23322850370466372,
"learning_rate": 6.979921036993041e-08,
"loss": 0.0,
"num_tokens": 152627184.0,
"reward": 0.7734375,
"reward_std": 0.13952961564064026,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 976
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.8166796141722196e-09,
"advantages/std": 0.4959683120250702,
"advantages/var": 0.24598456653299738,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.175026680896478,
"grad_norm": 0.23281228909620943,
"learning_rate": 6.911946890041254e-08,
"loss": -0.0,
"num_tokens": 152793914.0,
"reward": 0.58984375,
"reward_std": 0.12954191863536835,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 977
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.1818942141274374e-09,
"advantages/std": 0.6402619481086731,
"advantages/var": 0.4099353621959132,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.179295624332978,
"grad_norm": 0.3735471434566129,
"learning_rate": 6.844280764301074e-08,
"loss": -0.0,
"num_tokens": 152958215.0,
"reward": 0.703125,
"reward_std": 0.1759803295135498,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 978
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.4676026999950409,
"advantages/var": 0.21865228504265222,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.183564567769477,
"grad_norm": 0.33626281644396355,
"learning_rate": 6.776923143493635e-08,
"loss": 0.0,
"num_tokens": 153131159.0,
"reward": 0.5546875,
"reward_std": 0.1145455539226532,
"rewards/drgrpo_math_reward/mean": 0.5546875,
"rewards/drgrpo_math_reward/std": 0.49797385931015015,
"step": 979
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.8778015294086476e-09,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.187833511205977,
"grad_norm": 0.2626485019509204,
"learning_rate": 6.709874509134682e-08,
"loss": 0.0,
"num_tokens": 153283971.0,
"reward": 0.66015625,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 980
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 6.298465769204987e-10,
"advantages/std": 0.36966246366500854,
"advantages/var": 0.13665033704288376,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.71875,
"epoch": 4.192102454642476,
"grad_norm": 0.1579751828880399,
"learning_rate": 6.643135340531136e-08,
"loss": 0.0,
"num_tokens": 153432473.0,
"reward": 0.73828125,
"reward_std": 0.06431539356708527,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 981
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.1291849036688268e-09,
"advantages/std": 0.6185806393623352,
"advantages/var": 0.3826420073939154,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.196371398078975,
"grad_norm": 0.37827716848300164,
"learning_rate": 6.576706114777625e-08,
"loss": -0.0,
"num_tokens": 153598469.0,
"reward": 0.63671875,
"reward_std": 0.20186671614646912,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 982
},
{
"advantages/mean": -5.122274160385132e-09,
"advantages/snr": 1.0954491476993248e-08,
"advantages/std": 0.46759578585624695,
"advantages/var": 0.21864581895052115,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.2006403415154745,
"grad_norm": 0.22489880299470943,
"learning_rate": 6.510587306753135e-08,
"loss": -0.0,
"num_tokens": 153753379.0,
"reward": 0.7890625,
"reward_std": 0.10718280076980591,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 983
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.3472301909481332e-09,
"advantages/std": 0.49596890807151794,
"advantages/var": 0.24598515777365382,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.204909284951975,
"grad_norm": 0.2759049680659352,
"learning_rate": 6.444779389117578e-08,
"loss": 0.0,
"num_tokens": 153906711.0,
"reward": 0.78125,
"reward_std": 0.13018855452537537,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 984
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 3.636393538163668e-10,
"advantages/std": 0.6402789950370789,
"advantages/var": 0.40995719148569165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.209178228388474,
"grad_norm": 0.2965646840634463,
"learning_rate": 6.379282832308414e-08,
"loss": -0.0,
"num_tokens": 154066904.0,
"reward": 0.6484375,
"reward_std": 0.19977852702140808,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"step": 985
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.7941619841441395e-09,
"advantages/std": 0.5483075380325317,
"advantages/var": 0.30064115626329624,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.213447171824973,
"grad_norm": 0.3275903862159375,
"learning_rate": 6.314098104537325e-08,
"loss": -0.0,
"num_tokens": 154224905.0,
"reward": 0.72265625,
"reward_std": 0.15163899958133698,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 986
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 4.2250465863671716e-09,
"advantages/std": 0.4959651231765747,
"advantages/var": 0.24598140340755492,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.217716115261473,
"grad_norm": 0.2526583838758858,
"learning_rate": 6.249225671786784e-08,
"loss": 0.0,
"num_tokens": 154379416.0,
"reward": 0.67578125,
"reward_std": 0.1250661313533783,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 987
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.5055895915341618e-09,
"advantages/std": 0.6185766458511353,
"advantages/var": 0.3826370667924408,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 4.221985058697972,
"grad_norm": 0.3235101551669753,
"learning_rate": 6.184665997806831e-08,
"loss": -0.0,
"num_tokens": 154560711.0,
"reward": 0.62890625,
"reward_std": 0.19674183428287506,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"step": 988
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4937900080004843e-09,
"advantages/std": 0.4675971269607544,
"advantages/var": 0.21864707314195186,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.226254002134472,
"grad_norm": 0.33109445872143445,
"learning_rate": 6.120419544111655e-08,
"loss": -0.0,
"num_tokens": 154708701.0,
"reward": 0.77734375,
"reward_std": 0.10889026522636414,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 989
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 3.659081040337769e-09,
"advantages/std": 0.5726781487464905,
"advantages/var": 0.3279602620517075,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.230522945570971,
"grad_norm": 0.24498457461398332,
"learning_rate": 6.056486769976388e-08,
"loss": -0.0,
"num_tokens": 154874139.0,
"reward": 0.64453125,
"reward_std": 0.15308444201946259,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"step": 990
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.5969216095717883e-09,
"advantages/std": 0.4373989999294281,
"advantages/var": 0.19131788513926384,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.234791889007471,
"grad_norm": 0.22464548027046374,
"learning_rate": 5.992868132433753e-08,
"loss": 0.0,
"num_tokens": 155028752.0,
"reward": 0.6953125,
"reward_std": 0.0966646745800972,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 991
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.8777983699937923e-09,
"advantages/std": 0.4959651529788971,
"advantages/var": 0.2459814329693808,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.23906083244397,
"grad_norm": 0.25152309433924797,
"learning_rate": 5.929564086270833e-08,
"loss": -0.0,
"num_tokens": 155187937.0,
"reward": 0.67578125,
"reward_std": 0.1250661313533783,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 992
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 3.9060405706929244e-10,
"advantages/std": 0.5960783958435059,
"advantages/var": 0.35530945399136726,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.243329775880469,
"grad_norm": 0.24942456745885597,
"learning_rate": 5.8665750840258156e-08,
"loss": 0.0,
"num_tokens": 155347820.0,
"reward": 0.6796875,
"reward_std": 0.18675413727760315,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 993
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 6.899481121099718e-09,
"advantages/std": 0.4049533009529114,
"advantages/var": 0.16398717595265921,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.247598719316969,
"grad_norm": 0.21808756592922232,
"learning_rate": 5.8039015759847207e-08,
"loss": 0.0,
"num_tokens": 155466166.0,
"reward": 0.83203125,
"reward_std": 0.08390620350837708,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"step": 994
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 8.907182821412798e-10,
"advantages/std": 0.5227929949760437,
"advantages/var": 0.27331251559602165,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.251867662753469,
"grad_norm": 0.2570254852272524,
"learning_rate": 5.74154401017824e-08,
"loss": -0.0,
"num_tokens": 155618360.0,
"reward": 0.72265625,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 995
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.3360734626754018e-09,
"advantages/std": 0.5227945446968079,
"advantages/var": 0.27331413596474263,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 4.256136606189968,
"grad_norm": 0.27912900097061305,
"learning_rate": 5.6795028323784964e-08,
"loss": -0.0,
"num_tokens": 155761125.0,
"reward": 0.76953125,
"reward_std": 0.1422979235649109,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 996
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 3.906087050636722e-10,
"advantages/std": 0.5960713028907776,
"advantages/var": 0.3553009981299091,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.2604055496264674,
"grad_norm": 0.2666367592047003,
"learning_rate": 5.61777848609587e-08,
"loss": 0.0,
"num_tokens": 155926364.0,
"reward": 0.73046875,
"reward_std": 0.17662307620048523,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 997
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 4.4536934733273465e-09,
"advantages/std": 0.5227810144424438,
"advantages/var": 0.2732999890614707,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.264674493062967,
"grad_norm": 0.27490102092925023,
"learning_rate": 5.5563714125758335e-08,
"loss": 0.0,
"num_tokens": 156075696.0,
"reward": 0.7421875,
"reward_std": 0.12756997346878052,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 998
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.125861622660058e-09,
"advantages/std": 0.5227845311164856,
"advantages/var": 0.2733036659746837,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.268943436499466,
"grad_norm": 0.27584388928748027,
"learning_rate": 5.495282050795763e-08,
"loss": 0.0,
"num_tokens": 156227052.0,
"reward": 0.74609375,
"reward_std": 0.13098736107349396,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 999
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.095582713468022e-09,
"advantages/std": 0.548311710357666,
"advantages/var": 0.30064573171534903,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.273212379935966,
"grad_norm": 0.302465854225609,
"learning_rate": 5.434510837461853e-08,
"loss": 0.0,
"num_tokens": 156386422.0,
"reward": 0.578125,
"reward_std": 0.15623345971107483,
"rewards/drgrpo_math_reward/mean": 0.578125,
"rewards/drgrpo_math_reward/std": 0.49482619762420654,
"step": 1000
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.00831498586979e-09,
"advantages/std": 0.5227822065353394,
"advantages/var": 0.2733012354699582,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.277481323372466,
"grad_norm": 0.3149036165515859,
"learning_rate": 5.3740582070059435e-08,
"loss": 0.0,
"num_tokens": 156520181.0,
"reward": 0.84765625,
"reward_std": 0.12927743792533875,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"step": 1001
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.7555872617828865e-09,
"advantages/std": 0.4959664046764374,
"advantages/var": 0.24598267456767164,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.281750266808965,
"grad_norm": 0.2360541009064038,
"learning_rate": 5.313924591582453e-08,
"loss": -0.0,
"num_tokens": 156682366.0,
"reward": 0.796875,
"reward_std": 0.12677361071109772,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"step": 1002
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 8.464772819654116e-09,
"advantages/std": 0.46759918332099915,
"advantages/var": 0.21864899624246537,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.286019210245464,
"grad_norm": 0.26427574271744264,
"learning_rate": 5.2541104210653e-08,
"loss": 0.0,
"num_tokens": 156818097.0,
"reward": 0.8359375,
"reward_std": 0.11165857315063477,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"step": 1003
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.8778015294086476e-09,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 4.2902881536819635,
"grad_norm": 0.21933021292288724,
"learning_rate": 5.1946161230447485e-08,
"loss": -0.0,
"num_tokens": 156972474.0,
"reward": 0.70703125,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 1004
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 6.324696116718118e-09,
"advantages/std": 0.4049423038959503,
"advantages/var": 0.16397826948456018,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.294557097118463,
"grad_norm": 0.21391185657647827,
"learning_rate": 5.135442122824452e-08,
"loss": 0.0,
"num_tokens": 157113545.0,
"reward": 0.80859375,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 1005
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 7.468992883293068e-09,
"advantages/std": 0.4675944447517395,
"advantages/var": 0.21864456476268757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.298826040554963,
"grad_norm": 0.1798798776554831,
"learning_rate": 5.076588843418345e-08,
"loss": 0.0,
"num_tokens": 157269558.0,
"reward": 0.81640625,
"reward_std": 0.10547532141208649,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 1006
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.516801416758646e-09,
"advantages/std": 0.6185721755027771,
"advantages/var": 0.3826315363062385,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.303094983991462,
"grad_norm": 0.3492523238461835,
"learning_rate": 5.018056705547652e-08,
"loss": 0.0,
"num_tokens": 157423276.0,
"reward": 0.70703125,
"reward_std": 0.190556138753891,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 1007
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.9916444828301443e-09,
"advantages/std": 0.4676148593425751,
"advantages/var": 0.21866365667797627,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.307363927427962,
"grad_norm": 0.2531791458957888,
"learning_rate": 4.9598461276378734e-08,
"loss": -0.0,
"num_tokens": 157568748.0,
"reward": 0.765625,
"reward_std": 0.12703317403793335,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 1008
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.022351402535043e-09,
"advantages/std": 0.6185773611068726,
"advantages/var": 0.3826379516739422,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.311632870864461,
"grad_norm": 0.3824698656940314,
"learning_rate": 4.9019575258157866e-08,
"loss": 0.0,
"num_tokens": 157717700.0,
"reward": 0.60546875,
"reward_std": 0.19621387124061584,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"step": 1009
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.193799697029419e-09,
"advantages/std": 0.43740496039390564,
"advantages/var": 0.19132309937719416,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.31590181430096,
"grad_norm": 0.2268254209772028,
"learning_rate": 4.844391313906482e-08,
"loss": 0.0,
"num_tokens": 157847100.0,
"reward": 0.80859375,
"reward_std": 0.10231995582580566,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 1010
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 4.02474204665107e-09,
"advantages/std": 0.40494880080223083,
"advantages/var": 0.16398353127116483,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.3201707577374595,
"grad_norm": 0.18082226148243413,
"learning_rate": 4.787147903430383e-08,
"loss": 0.0,
"num_tokens": 157980420.0,
"reward": 0.8125,
"reward_std": 0.08048880845308304,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"step": 1011
},
{
"advantages/mean": 3.4924596548080444e-09,
"advantages/snr": 6.098430951616284e-09,
"advantages/std": 0.5726816654205322,
"advantages/var": 0.3279642899088344,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.32443970117396,
"grad_norm": 0.28665244349419267,
"learning_rate": 4.7302277036003534e-08,
"loss": -0.0,
"num_tokens": 158142110.0,
"reward": 0.67578125,
"reward_std": 0.15873728692531586,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 1012
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.816740376924563e-09,
"advantages/std": 0.495957612991333,
"advantages/var": 0.24597395388406085,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.328708644610459,
"grad_norm": 0.2654031864670325,
"learning_rate": 4.673631121318672e-08,
"loss": -0.0,
"num_tokens": 158292692.0,
"reward": 0.70703125,
"reward_std": 0.11982014775276184,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 1013
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.5055969904057808e-09,
"advantages/std": 0.6185736060142517,
"advantages/var": 0.3826333060574747,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.332977588046958,
"grad_norm": 0.36422258318792977,
"learning_rate": 4.617358561174278e-08,
"loss": -0.0,
"num_tokens": 158452300.0,
"reward": 0.6796875,
"reward_std": 0.19161942601203918,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 1014
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.987587949498236e-09,
"advantages/std": 0.23379793763160706,
"advantages/var": 0.05466147564079282,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.337246531483458,
"grad_norm": 0.08889800429976145,
"learning_rate": 4.561410425439743e-08,
"loss": -0.0,
"num_tokens": 158586944.0,
"reward": 0.8203125,
"reward_std": 0.027221955358982086,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 1015
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.299886958035194e-09,
"advantages/std": 0.40494275093078613,
"advantages/var": 0.1639786315313927,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.341515474919957,
"grad_norm": 0.17475737149587717,
"learning_rate": 4.5057871140684325e-08,
"loss": -0.0,
"num_tokens": 158734807.0,
"reward": 0.6953125,
"reward_std": 0.07536394149065018,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 1016
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.520274014732149e-09,
"advantages/std": 0.5483058094978333,
"advantages/var": 0.3006392607290742,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.345784418356457,
"grad_norm": 0.3731423923229412,
"learning_rate": 4.450489024691689e-08,
"loss": 0.0,
"num_tokens": 158886692.0,
"reward": 0.6875,
"reward_std": 0.14887069165706635,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 1017
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 6.298465769204987e-10,
"advantages/std": 0.36966246366500854,
"advantages/var": 0.13665033704288376,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.350053361792956,
"grad_norm": 0.22982263969851446,
"learning_rate": 4.39551655261593e-08,
"loss": 0.0,
"num_tokens": 159030327.0,
"reward": 0.66796875,
"reward_std": 0.06431539356708527,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 1018
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 9.76531722867246e-09,
"advantages/std": 0.5960652232170105,
"advantages/var": 0.35529375032874455,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.354322305229456,
"grad_norm": 0.3434728947464965,
"learning_rate": 4.340870090819865e-08,
"loss": -0.0,
"num_tokens": 159188666.0,
"reward": 0.73828125,
"reward_std": 0.16978827118873596,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 1019
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.226825409861377e-09,
"advantages/std": 0.522786021232605,
"advantages/var": 0.2733052239962177,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.6875,
"epoch": 4.358591248665955,
"grad_norm": 0.3046441133239695,
"learning_rate": 4.286550029951674e-08,
"loss": 0.0,
"num_tokens": 159355247.0,
"reward": 0.69140625,
"reward_std": 0.1332252472639084,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 1020
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 5.323144933438982e-10,
"advantages/std": 0.4373930096626282,
"advantages/var": 0.19131264490173194,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.362860192102454,
"grad_norm": 0.21511357782167576,
"learning_rate": 4.232556758326211e-08,
"loss": 0.0,
"num_tokens": 159497734.0,
"reward": 0.87109375,
"reward_std": 0.09100939333438873,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"step": 1021
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4083371836861376e-09,
"advantages/std": 0.4959692358970642,
"advantages/var": 0.24598548295631772,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.3671291355389545,
"grad_norm": 0.25509993985117824,
"learning_rate": 4.178890661922241e-08,
"loss": 0.0,
"num_tokens": 159660252.0,
"reward": 0.63671875,
"reward_std": 0.13071897625923157,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 1022
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.9876135291586783e-09,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 4.371398078975454,
"grad_norm": 0.2952935455145491,
"learning_rate": 4.125552124379628e-08,
"loss": -0.0,
"num_tokens": 159797853.0,
"reward": 0.86328125,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"step": 1023
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.481332608054646e-09,
"advantages/std": 0.4676010310649872,
"advantages/var": 0.2186507242530391,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.375667022411953,
"grad_norm": 0.24696306722713282,
"learning_rate": 4.072541526996681e-08,
"loss": 0.0,
"num_tokens": 159956444.0,
"reward": 0.65625,
"reward_std": 0.11230766773223877,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 1024
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 7.64347128426064e-09,
"advantages/std": 0.5483047366142273,
"advantages/var": 0.30063808419359717,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.5,
"epoch": 4.3799359658484525,
"grad_norm": 0.2722365045836188,
"learning_rate": 4.019859248727342e-08,
"loss": 0.0,
"num_tokens": 160136653.0,
"reward": 0.61328125,
"reward_std": 0.14886824786663055,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 1025
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 3.821674987396077e-09,
"advantages/std": 0.5483134388923645,
"advantages/var": 0.30064762726997074,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.384204909284952,
"grad_norm": 0.29008508597364296,
"learning_rate": 3.967505666178555e-08,
"loss": -0.0,
"num_tokens": 160293294.0,
"reward": 0.65234375,
"reward_std": 0.15900175273418427,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 1026
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.2738974786069975e-09,
"advantages/std": 0.5483109354972839,
"advantages/var": 0.30064488198590666,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.388473852721451,
"grad_norm": 0.3482378111702543,
"learning_rate": 3.915481153607525e-08,
"loss": 0.0,
"num_tokens": 160445614.0,
"reward": 0.6875,
"reward_std": 0.15676140785217285,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 1027
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.794179709875018e-09,
"advantages/std": 0.5483061075210571,
"advantages/var": 0.30063958754489306,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 4.392742796157951,
"grad_norm": 0.25458533017295193,
"learning_rate": 3.8637860829190185e-08,
"loss": 0.0,
"num_tokens": 160605812.0,
"reward": 0.76171875,
"reward_std": 0.14940111339092255,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 1028
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.81670043231303e-09,
"advantages/std": 0.4959646463394165,
"advantages/var": 0.2459809304185825,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.397011739594451,
"grad_norm": 0.2933507838338875,
"learning_rate": 3.812420823662782e-08,
"loss": 0.0,
"num_tokens": 160746801.0,
"reward": 0.7421875,
"reward_std": 0.1261245161294937,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 1029
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.9833938148533065e-09,
"advantages/std": 0.46760255098342896,
"advantages/var": 0.21865214568621028,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.40128068303095,
"grad_norm": 0.30265927689089706,
"learning_rate": 3.76138574303082e-08,
"loss": 0.0,
"num_tokens": 160887984.0,
"reward": 0.7890625,
"reward_std": 0.11613436043262482,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 1030
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 5.174635975246255e-09,
"advantages/std": 0.4049513339996338,
"advantages/var": 0.16398558290808296,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.405549626467449,
"grad_norm": 0.20903610857543548,
"learning_rate": 3.7106812058548375e-08,
"loss": 0.0,
"num_tokens": 161046011.0,
"reward": 0.69140625,
"reward_std": 0.08166831731796265,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 1031
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 4.8786649560713735e-09,
"advantages/std": 0.286345511674881,
"advantages/var": 0.0819937520563494,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.4098185699039485,
"grad_norm": 0.1688138649146096,
"learning_rate": 3.660307574603588e-08,
"loss": 0.0,
"num_tokens": 161173076.0,
"reward": 0.80859375,
"reward_std": 0.04221830889582634,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 1032
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 4.481491415916831e-09,
"advantages/std": 0.4675844609737396,
"advantages/var": 0.21863522814410263,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.414087513340448,
"grad_norm": 0.2795226378853111,
"learning_rate": 3.6102652093802974e-08,
"loss": -0.0,
"num_tokens": 161313095.0,
"reward": 0.71875,
"reward_std": 0.0974610298871994,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1033
},
{
"advantages/mean": -4.6566128730773926e-09,
"advantages/snr": 8.4927717217003e-09,
"advantages/std": 0.5483030676841736,
"advantages/var": 0.30063625403187544,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.418356456776948,
"grad_norm": 0.28811601274226684,
"learning_rate": 3.560554467920096e-08,
"loss": 0.0,
"num_tokens": 161456325.0,
"reward": 0.80078125,
"reward_std": 0.1465141326189041,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 1034
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.065668450848788e-09,
"advantages/std": 0.5726749300956726,
"advantages/var": 0.3279565755600835,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.422625400213447,
"grad_norm": 0.24685666805511652,
"learning_rate": 3.5111757055874326e-08,
"loss": 0.0,
"num_tokens": 161608634.0,
"reward": 0.7265625,
"reward_std": 0.14966705441474915,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 1035
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.218797108153148e-09,
"advantages/std": 0.5483075380325317,
"advantages/var": 0.30064115626329624,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 4.426894343649947,
"grad_norm": 0.3321275267280128,
"learning_rate": 3.4621292753735765e-08,
"loss": 0.0,
"num_tokens": 161769820.0,
"reward": 0.68359375,
"reward_std": 0.15163899958133698,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 1036
},
{
"advantages/mean": -5.122274160385132e-09,
"advantages/snr": 1.0954451680481181e-08,
"advantages/std": 0.46759748458862305,
"advantages/var": 0.21864740759360757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.431163287086446,
"grad_norm": 0.3842088085646925,
"learning_rate": 3.413415527894059e-08,
"loss": 0.0,
"num_tokens": 161910800.0,
"reward": 0.84375,
"reward_std": 0.10942068696022034,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"step": 1037
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 5.3229534351711075e-09,
"advantages/std": 0.4374087452888489,
"advantages/var": 0.19132641045516507,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 4.435432230522945,
"grad_norm": 0.21771886571873586,
"learning_rate": 3.365034811386186e-08,
"loss": 0.0,
"num_tokens": 162069764.0,
"reward": 0.68359375,
"reward_std": 0.1052069365978241,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"step": 1038
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4083616409231023e-09,
"advantages/std": 0.49596062302589417,
"advantages/var": 0.2459769395922331,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.439701173959445,
"grad_norm": 0.28642282646933764,
"learning_rate": 3.316987471706556e-08,
"loss": -0.0,
"num_tokens": 162225218.0,
"reward": 0.71875,
"reward_std": 0.12217670679092407,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1039
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.97115066346681e-09,
"advantages/std": 0.4675883650779724,
"advantages/var": 0.2186388791562912,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.443970117395945,
"grad_norm": 0.25478303447053396,
"learning_rate": 3.269273852328547e-08,
"loss": -0.0,
"num_tokens": 162372123.0,
"reward": 0.76953125,
"reward_std": 0.10087841749191284,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1040
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.3970688057076e-09,
"advantages/std": 0.5483095049858093,
"advantages/var": 0.30064331325778326,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 4.448239060832444,
"grad_norm": 0.30279611286008074,
"learning_rate": 3.2218942943399105e-08,
"loss": 0.0,
"num_tokens": 162533746.0,
"reward": 0.7109375,
"reward_std": 0.15452352166175842,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 1041
},
{
"advantages/mean": 3.958120942115784e-09,
"advantages/snr": 6.640369898388981e-09,
"advantages/std": 0.5960693359375,
"advantages/var": 0.35529865324497223,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.452508004268943,
"grad_norm": 0.2731987958255392,
"learning_rate": 3.174849136440294e-08,
"loss": -0.0,
"num_tokens": 162687667.0,
"reward": 0.77734375,
"reward_std": 0.17491313815116882,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"step": 1042
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.98068239603254e-09,
"advantages/std": 0.4959627091884613,
"advantages/var": 0.24597900890555824,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.456776947705443,
"grad_norm": 0.2104365270449523,
"learning_rate": 3.128138714938855e-08,
"loss": 0.0,
"num_tokens": 162834651.0,
"reward": 0.76953125,
"reward_std": 0.12335620820522308,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1043
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.439373903985093e-09,
"advantages/std": 0.5726813077926636,
"advantages/var": 0.3279638802951155,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.461045891141943,
"grad_norm": 0.3192273891677802,
"learning_rate": 3.081763363751844e-08,
"loss": 0.0,
"num_tokens": 162987603.0,
"reward": 0.703125,
"reward_std": 0.15650182962417603,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 1044
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.234946226282646e-09,
"advantages/std": 0.5227998495101929,
"advantages/var": 0.2733196826478803,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.465314834578442,
"grad_norm": 0.2526000182994563,
"learning_rate": 3.035723414400176e-08,
"loss": -0.0,
"num_tokens": 163145350.0,
"reward": 0.74609375,
"reward_std": 0.14848363399505615,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 1045
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.6722020691798937e-09,
"advantages/std": 0.5227837562561035,
"advantages/var": 0.27330285580524105,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.4695837780149414,
"grad_norm": 0.32390884070268056,
"learning_rate": 2.990019196007154e-08,
"loss": 0.0,
"num_tokens": 163305062.0,
"reward": 0.69140625,
"reward_std": 0.13151532411575317,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 1046
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 7.041755735282429e-09,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 4.473852721451441,
"grad_norm": 0.26562549241892064,
"learning_rate": 2.9446510352959918e-08,
"loss": 0.0,
"num_tokens": 163472473.0,
"reward": 0.63671875,
"reward_std": 0.1255941092967987,
"rewards/drgrpo_math_reward/mean": 0.63671875,
"rewards/drgrpo_math_reward/std": 0.48188701272010803,
"step": 1047
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 5.749659422971484e-10,
"advantages/std": 0.40494683384895325,
"advantages/var": 0.16398193824429175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.47812166488794,
"grad_norm": 0.29683929045193824,
"learning_rate": 2.8996192565876042e-08,
"loss": -0.0,
"num_tokens": 163597754.0,
"reward": 0.7578125,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 1048
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.0655749347956924e-10,
"advantages/std": 0.572688102722168,
"advantages/var": 0.3279716629995164,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.482390608324439,
"grad_norm": 0.3116239712728128,
"learning_rate": 2.8549241817982017e-08,
"loss": -0.0,
"num_tokens": 163745480.0,
"reward": 0.70703125,
"reward_std": 0.16557207703590393,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 1049
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 5.749631495215814e-10,
"advantages/std": 0.40494880080223083,
"advantages/var": 0.16398353127116483,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.48665955176094,
"grad_norm": 0.19240322344362223,
"learning_rate": 2.8105661304370253e-08,
"loss": -0.0,
"num_tokens": 163889341.0,
"reward": 0.828125,
"reward_std": 0.08048880845308304,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 1050
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 6.920088884979619e-09,
"advantages/std": 0.4373929798603058,
"advantages/var": 0.19131261883107786,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.490928495197439,
"grad_norm": 0.2267817838396342,
"learning_rate": 2.766545419604066e-08,
"loss": -0.0,
"num_tokens": 164040469.0,
"reward": 0.78515625,
"reward_std": 0.09100939333438873,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 1051
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.129230843823993e-09,
"advantages/std": 0.4373985826969147,
"advantages/var": 0.1913175201452697,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.495197438633938,
"grad_norm": 0.2770427223945021,
"learning_rate": 2.722862363987749e-08,
"loss": 0.0,
"num_tokens": 164193848.0,
"reward": 0.51171875,
"reward_std": 0.096134252846241,
"rewards/drgrpo_math_reward/mean": 0.51171875,
"rewards/drgrpo_math_reward/std": 0.5008418560028076,
"step": 1052
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.680390924281249e-09,
"advantages/std": 0.5227926969528198,
"advantages/var": 0.2733122039872029,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.4994663820704375,
"grad_norm": 0.2707090909721515,
"learning_rate": 2.6795172758627584e-08,
"loss": 0.0,
"num_tokens": 164344284.0,
"reward": 0.78125,
"reward_std": 0.13952963054180145,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 1053
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.347276877087511e-09,
"advantages/std": 0.4959590435028076,
"advantages/var": 0.24597537283221982,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.503735325506937,
"grad_norm": 0.21248376510657957,
"learning_rate": 2.636510465087771e-08,
"loss": 0.0,
"num_tokens": 164496584.0,
"reward": 0.640625,
"reward_std": 0.11993881314992905,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 1054
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.259648064183464e-09,
"advantages/std": 0.3696756958961487,
"advantages/var": 0.1366601201363018,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 4.508004268943436,
"grad_norm": 0.1704470192455297,
"learning_rate": 2.5938422391032055e-08,
"loss": 0.0,
"num_tokens": 164636662.0,
"reward": 0.79296875,
"reward_std": 0.07456512749195099,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 1055
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.680342941005758e-09,
"advantages/std": 0.5227964520454407,
"advantages/var": 0.27331613027130075,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.512273212379936,
"grad_norm": 0.29609821089912336,
"learning_rate": 2.5515129029290984e-08,
"loss": 0.0,
"num_tokens": 164779163.0,
"reward": 0.71875,
"reward_std": 0.1434774398803711,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1056
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.397123829683448e-09,
"advantages/std": 0.5483006238937378,
"advantages/var": 0.3006335741622621,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.516542155816436,
"grad_norm": 0.3321857749921581,
"learning_rate": 2.5095227591628466e-08,
"loss": -0.0,
"num_tokens": 164935062.0,
"reward": 0.78125,
"reward_std": 0.1442737877368927,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 1057
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 3.4856073226994137e-09,
"advantages/std": 0.4675840735435486,
"advantages/var": 0.21863486583157865,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.520811099252935,
"grad_norm": 0.2390333733297495,
"learning_rate": 2.467872107977098e-08,
"loss": 0.0,
"num_tokens": 165097134.0,
"reward": 0.61328125,
"reward_std": 0.0969306156039238,
"rewards/drgrpo_math_reward/mean": 0.61328125,
"rewards/drgrpo_math_reward/std": 0.4879522919654846,
"step": 1058
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 5.633372091520215e-09,
"advantages/std": 0.49596717953681946,
"advantages/var": 0.2459834431777077,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.525080042689434,
"grad_norm": 0.2169048732862071,
"learning_rate": 2.4265612471176032e-08,
"loss": -0.0,
"num_tokens": 165250060.0,
"reward": 0.7734375,
"reward_std": 0.12953945994377136,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 1059
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 6.29855717175748e-09,
"advantages/std": 0.36965709924697876,
"advantages/var": 0.1366463710236907,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.5293489861259335,
"grad_norm": 0.23679888686417735,
"learning_rate": 2.3855904719010443e-08,
"loss": 0.0,
"num_tokens": 165392113.0,
"reward": 0.76171875,
"reward_std": 0.060367584228515625,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 1060
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.0656379836078813e-10,
"advantages/std": 0.5726792216300964,
"advantages/var": 0.3279614908868531,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.533617929562434,
"grad_norm": 0.23892417050547263,
"learning_rate": 2.3449600752129596e-08,
"loss": 0.0,
"num_tokens": 165549716.0,
"reward": 0.734375,
"reward_std": 0.154791921377182,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"step": 1061
},
{
"advantages/mean": -5.587935447692871e-09,
"advantages/snr": 1.0191165426936045e-08,
"advantages/std": 0.548311710357666,
"advantages/var": 0.30064573171534903,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.537886872998933,
"grad_norm": 0.28337521087616024,
"learning_rate": 2.304670347505655e-08,
"loss": 0.0,
"num_tokens": 165699524.0,
"reward": 0.7578125,
"reward_std": 0.15623344480991364,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 1062
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.6262474074661002e-09,
"advantages/std": 0.5726819634437561,
"advantages/var": 0.3279646312537956,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.542155816435432,
"grad_norm": 0.2737285430712695,
"learning_rate": 2.264721576796108e-08,
"loss": -0.0,
"num_tokens": 165850014.0,
"reward": 0.8046875,
"reward_std": 0.15926769375801086,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"step": 1063
},
{
"advantages/mean": -3.4924596548080444e-09,
"advantages/snr": 6.680435099969692e-09,
"advantages/std": 0.5227892398834229,
"advantages/var": 0.27330858933788704,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.546424759871932,
"grad_norm": 0.34394186222827744,
"learning_rate": 2.2251140486639063e-08,
"loss": 0.0,
"num_tokens": 166011272.0,
"reward": 0.58984375,
"reward_std": 0.13611222803592682,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"step": 1064
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.4937803921789752e-09,
"advantages/std": 0.46760013699531555,
"advantages/var": 0.21864988811803787,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.550693703308431,
"grad_norm": 0.24316829786415944,
"learning_rate": 2.1858480462492278e-08,
"loss": -0.0,
"num_tokens": 166167939.0,
"reward": 0.6796875,
"reward_std": 0.11283563077449799,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 1065
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 6.298247429400683e-09,
"advantages/std": 0.36967527866363525,
"advantages/var": 0.13665981165503638,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 4.554962646744931,
"grad_norm": 0.2219964251372802,
"learning_rate": 2.1469238502507926e-08,
"loss": 0.0,
"num_tokens": 166314808.0,
"reward": 0.8203125,
"reward_std": 0.07403472065925598,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"step": 1066
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.694562781299176e-10,
"advantages/std": 0.4959580898284912,
"advantages/var": 0.24597442686632576,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 4.55923159018143,
"grad_norm": 0.3653896044998966,
"learning_rate": 2.1083417389238855e-08,
"loss": 0.0,
"num_tokens": 166455155.0,
"reward": 0.6640625,
"reward_std": 0.11876175552606583,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 1067
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.439362225104603e-09,
"advantages/std": 0.5726840496063232,
"advantages/var": 0.3279670206734977,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 4.56350053361793,
"grad_norm": 0.269576721593581,
"learning_rate": 2.070101988078332e-08,
"loss": -0.0,
"num_tokens": 166620743.0,
"reward": 0.640625,
"reward_std": 0.16097761690616608,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"step": 1068
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 5.038778708786431e-09,
"advantages/std": 0.36966201663017273,
"advantages/var": 0.1366500065390861,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.567769477054429,
"grad_norm": 0.147475285462347,
"learning_rate": 2.0322048710765483e-08,
"loss": -0.0,
"num_tokens": 166783154.0,
"reward": 0.6796875,
"reward_std": 0.06378498673439026,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 1069
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.4373958706855774,
"advantages/var": 0.19131514769279434,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.572038420490928,
"grad_norm": 0.2517716303588861,
"learning_rate": 1.9946506588315814e-08,
"loss": 0.0,
"num_tokens": 166925860.0,
"reward": 0.75390625,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 1070
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 8.131242115120482e-10,
"advantages/std": 0.5726816058158875,
"advantages/var": 0.3279642216398635,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 4.576307363927428,
"grad_norm": 0.3634852172155903,
"learning_rate": 1.9574396198051958e-08,
"loss": 0.0,
"num_tokens": 167073133.0,
"reward": 0.76953125,
"reward_std": 0.15703225135803223,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1071
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.2997950517032087e-09,
"advantages/std": 0.40495893359184265,
"advantages/var": 0.16399173789584243,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.580576307363927,
"grad_norm": 0.24460457844348826,
"learning_rate": 1.920572020005884e-08,
"loss": -0.0,
"num_tokens": 167213513.0,
"reward": 0.7421875,
"reward_std": 0.08850065618753433,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"step": 1072
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.1639892700339495e-09,
"advantages/std": 0.49596095085144043,
"advantages/var": 0.2459772647694649,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.584845250800427,
"grad_norm": 0.25322361819317524,
"learning_rate": 1.8840481229870643e-08,
"loss": -0.0,
"num_tokens": 167346389.0,
"reward": 0.79296875,
"reward_std": 0.12270711362361908,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 1073
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.755668279409157e-09,
"advantages/std": 0.4959557056427002,
"advantages/var": 0.24597206195954868,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.5891141942369265,
"grad_norm": 0.25045970534297207,
"learning_rate": 1.84786818984512e-08,
"loss": 0.0,
"num_tokens": 167490353.0,
"reward": 0.7734375,
"reward_std": 0.11705183982849121,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 1074
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.008270656855743e-09,
"advantages/std": 0.5227879881858826,
"advantages/var": 0.2733072805914425,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 4.593383137673426,
"grad_norm": 0.2542750259013028,
"learning_rate": 1.8120324792175567e-08,
"loss": 0.0,
"num_tokens": 167654451.0,
"reward": 0.6796875,
"reward_std": 0.1344047635793686,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 1075
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.3360743766432835e-09,
"advantages/std": 0.5227941870689392,
"advantages/var": 0.273313762033073,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.597652081109925,
"grad_norm": 0.2591197084066371,
"learning_rate": 1.776541247281177e-08,
"loss": 0.0,
"num_tokens": 167808715.0,
"reward": 0.6953125,
"reward_std": 0.1417675018310547,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 1076
},
{
"advantages/mean": 4.190951585769653e-09,
"advantages/snr": 8.450023102116827e-09,
"advantages/std": 0.4959692358970642,
"advantages/var": 0.24598548295631772,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.601921024546424,
"grad_norm": 0.2634345872404297,
"learning_rate": 1.7413947477501913e-08,
"loss": 0.0,
"num_tokens": 167956325.0,
"reward": 0.71484375,
"reward_std": 0.13071896135807037,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"step": 1077
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.6946234331898e-10,
"advantages/std": 0.49595168232917786,
"advantages/var": 0.24596807120514175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.606189967982925,
"grad_norm": 0.30768554301495155,
"learning_rate": 1.7065932318744702e-08,
"loss": -0.0,
"num_tokens": 168099570.0,
"reward": 0.75,
"reward_std": 0.11310402303934097,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"step": 1078
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.5624132598511334e-09,
"advantages/std": 0.5960795283317566,
"advantages/var": 0.3553108040962094,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 4.610458911419424,
"grad_norm": 0.3892376803698612,
"learning_rate": 1.6721369484377078e-08,
"loss": -0.0,
"num_tokens": 168248423.0,
"reward": 0.69921875,
"reward_std": 0.18687279522418976,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 1079
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 3.872947870535686e-09,
"advantages/std": 0.6612887978553772,
"advantages/var": 0.4373028741690099,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.614727854855923,
"grad_norm": 0.3490531845018821,
"learning_rate": 1.6380261437556662e-08,
"loss": 0.0,
"num_tokens": 168418683.0,
"reward": 0.6875,
"reward_std": 0.2284420132637024,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 1080
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.0646189037466052e-09,
"advantages/std": 0.43739715218544006,
"advantages/var": 0.19131626873993302,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.6189967982924225,
"grad_norm": 0.253860150661586,
"learning_rate": 1.604261061674378e-08,
"loss": 0.0,
"num_tokens": 168557074.0,
"reward": 0.78125,
"reward_std": 0.09442678093910217,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 1081
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4083580865314121e-09,
"advantages/std": 0.49596187472343445,
"advantages/var": 0.24597818117918369,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.623265741728922,
"grad_norm": 0.255409443386475,
"learning_rate": 1.570841943568446e-08,
"loss": 0.0,
"num_tokens": 168708907.0,
"reward": 0.73828125,
"reward_std": 0.1238841712474823,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"step": 1082
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.1175609054645574e-09,
"advantages/std": 0.5227851271629333,
"advantages/var": 0.2733042891827644,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.627534685165422,
"grad_norm": 0.2720860789972887,
"learning_rate": 1.5377690283392975e-08,
"loss": 0.0,
"num_tokens": 168841842.0,
"reward": 0.78515625,
"reward_std": 0.13204818964004517,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"step": 1083
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.987646852646199e-09,
"advantages/std": 0.4675866663455963,
"advantages/var": 0.218637290544188,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.631803628601921,
"grad_norm": 0.23422061530175992,
"learning_rate": 1.505042552413466e-08,
"loss": -0.0,
"num_tokens": 168987902.0,
"reward": 0.76171875,
"reward_std": 0.09864053130149841,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 1084
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 6.181901817442994e-09,
"advantages/std": 0.6402755975723267,
"advantages/var": 0.4099528408466,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 4.636072572038421,
"grad_norm": 0.3098506110740803,
"learning_rate": 1.4726627497409272e-08,
"loss": 0.0,
"num_tokens": 169149174.0,
"reward": 0.69140625,
"reward_std": 0.19530031085014343,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 1085
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5483114123344421,
"advantages/var": 0.3006454048961906,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.64034151547492,
"grad_norm": 0.29092441908057226,
"learning_rate": 1.4406298517934067e-08,
"loss": 0.0,
"num_tokens": 169294764.0,
"reward": 0.74609375,
"reward_std": 0.15570303797721863,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"step": 1086
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 6.387665982680258e-09,
"advantages/std": 0.4374004006385803,
"advantages/var": 0.19131911047879058,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.644610458911419,
"grad_norm": 0.3130123884635255,
"learning_rate": 1.4089440875627356e-08,
"loss": 0.0,
"num_tokens": 169431889.0,
"reward": 0.76953125,
"reward_std": 0.09837214648723602,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1087
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 4.481457711149072e-09,
"advantages/std": 0.46758797764778137,
"advantages/var": 0.2186385168407421,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 4.6488794023479185,
"grad_norm": 0.2311164862473825,
"learning_rate": 1.3776056835592131e-08,
"loss": -0.0,
"num_tokens": 169598500.0,
"reward": 0.5703125,
"reward_std": 0.10034800320863724,
"rewards/drgrpo_math_reward/mean": 0.5703125,
"rewards/drgrpo_math_reward/std": 0.4960011839866638,
"step": 1088
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 5.323110477241685e-09,
"advantages/std": 0.437395840883255,
"advantages/var": 0.19131512162196973,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.653148345784419,
"grad_norm": 0.22020508120163657,
"learning_rate": 1.3466148638099528e-08,
"loss": -0.0,
"num_tokens": 169749373.0,
"reward": 0.84765625,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"step": 1089
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.285228162442336e-09,
"advantages/std": 0.5726901888847351,
"advantages/var": 0.3279740524448336,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.657417289220918,
"grad_norm": 0.2975456107825843,
"learning_rate": 1.3159718498573558e-08,
"loss": -0.0,
"num_tokens": 169893473.0,
"reward": 0.76953125,
"reward_std": 0.16728198528289795,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1090
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.562928780215032e-09,
"advantages/std": 0.5227848291397095,
"advantages/var": 0.2733039775786352,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.661686232657417,
"grad_norm": 0.2822154330392377,
"learning_rate": 1.2856768607574564e-08,
"loss": 0.0,
"num_tokens": 170042017.0,
"reward": 0.765625,
"reward_std": 0.13151776790618896,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"step": 1091
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.0646287690683442e-09,
"advantages/std": 0.43739309906959534,
"advantages/var": 0.19131272311370484,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.665955176093917,
"grad_norm": 0.3146296229451411,
"learning_rate": 1.2557301130783849e-08,
"loss": -0.0,
"num_tokens": 170191746.0,
"reward": 0.69140625,
"reward_std": 0.09271440654993057,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 1092
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 2.5455193155458818e-09,
"advantages/std": 0.6402679681777954,
"advantages/var": 0.40994307107452244,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.670224119530416,
"grad_norm": 0.3272099063255255,
"learning_rate": 1.2261318208988292e-08,
"loss": 0.0,
"num_tokens": 170340786.0,
"reward": 0.75390625,
"reward_std": 0.18505056202411652,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"step": 1093
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.218884999076207e-09,
"advantages/std": 0.5483008623123169,
"advantages/var": 0.3006338356124303,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.674493062966915,
"grad_norm": 0.34863042945311856,
"learning_rate": 1.1968821958064701e-08,
"loss": 0.0,
"num_tokens": 170489979.0,
"reward": 0.76953125,
"reward_std": 0.1448042094707489,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1094
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.4084047181915234e-09,
"advantages/std": 0.49594545364379883,
"advantages/var": 0.2459618929899534,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.678762006403415,
"grad_norm": 0.2538455050726734,
"learning_rate": 1.167981446896521e-08,
"loss": -0.0,
"num_tokens": 170656817.0,
"reward": 0.66796875,
"reward_std": 0.10627168416976929,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 1095
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 9.958592436801074e-10,
"advantages/std": 0.46759748458862305,
"advantages/var": 0.21864740759360757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.683030949839915,
"grad_norm": 0.28690968099611613,
"learning_rate": 1.1394297807701736e-08,
"loss": 0.0,
"num_tokens": 170786325.0,
"reward": 0.7890625,
"reward_std": 0.10942068696022034,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"step": 1096
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.273931268670851e-09,
"advantages/std": 0.5482963919639587,
"advantages/var": 0.3006289334406951,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.687299893276414,
"grad_norm": 0.32286900459011325,
"learning_rate": 1.1112274015331657e-08,
"loss": 0.0,
"num_tokens": 170947675.0,
"reward": 0.76171875,
"reward_std": 0.13967934250831604,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 1097
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.511192125942753e-09,
"advantages/std": 0.49596524238586426,
"advantages/var": 0.24598152165486908,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.691568836712913,
"grad_norm": 0.2753894199046987,
"learning_rate": 1.083374510794305e-08,
"loss": 0.0,
"num_tokens": 171090866.0,
"reward": 0.65234375,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 1098
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.6262492693233955e-09,
"advantages/std": 0.5726813077926636,
"advantages/var": 0.3279638802951155,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 4.695837780149413,
"grad_norm": 0.3435572525051184,
"learning_rate": 1.0558713076640413e-08,
"loss": 0.0,
"num_tokens": 171236588.0,
"reward": 0.78125,
"reward_std": 0.15650184452533722,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"step": 1099
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 2.816709402826297e-09,
"advantages/std": 0.49596306681632996,
"advantages/var": 0.24597936364585937,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.700106723585913,
"grad_norm": 0.2888164459361838,
"learning_rate": 1.0287179887530139e-08,
"loss": 0.0,
"num_tokens": 171383928.0,
"reward": 0.7109375,
"reward_std": 0.12388662248849869,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 1100
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.7814751550759118e-09,
"advantages/std": 0.5227816700935364,
"advantages/var": 0.2733006745857871,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 4.704375667022412,
"grad_norm": 0.34120205918851826,
"learning_rate": 1.0019147481706625e-08,
"loss": 0.0,
"num_tokens": 171541379.0,
"reward": 0.703125,
"reward_std": 0.12863078713417053,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 1101
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.983395592005843e-09,
"advantages/std": 0.46760234236717224,
"advantages/var": 0.21865195058726616,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.75,
"epoch": 4.7086446104589115,
"grad_norm": 0.26555135370742966,
"learning_rate": 9.754617775238561e-09,
"loss": 0.0,
"num_tokens": 171693104.0,
"reward": 0.72265625,
"reward_std": 0.114015132188797,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 1102
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.975343977137235e-09,
"advantages/std": 0.46758273243904114,
"advantages/var": 0.21863361167515993,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.712913553895411,
"grad_norm": 0.252917843314809,
"learning_rate": 9.493592659155002e-09,
"loss": 0.0,
"num_tokens": 171854339.0,
"reward": 0.8125,
"reward_std": 0.09522314369678497,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"step": 1103
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.694503823521619e-10,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.71718249733191,
"grad_norm": 0.27037004134189363,
"learning_rate": 9.236073999431937e-09,
"loss": 0.0,
"num_tokens": 172009884.0,
"reward": 0.81640625,
"reward_std": 0.1255941092967987,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 1104
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 5.944934388860789e-09,
"advantages/std": 0.5483036041259766,
"advantages/var": 0.3006368422975356,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.72145144076841,
"grad_norm": 0.2863127137607284,
"learning_rate": 8.98206363697901e-09,
"loss": 0.0,
"num_tokens": 172153790.0,
"reward": 0.7265625,
"reward_std": 0.14716076850891113,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 1105
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 4.481366596560484e-09,
"advantages/std": 0.46759748458862305,
"advantages/var": 0.21864740759360757,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.72572038420491,
"grad_norm": 0.20646929432145555,
"learning_rate": 8.731563387626096e-09,
"loss": -0.0,
"num_tokens": 172319948.0,
"reward": 0.6328125,
"reward_std": 0.10942068696022034,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 1106
},
{
"advantages/mean": -3.026798367500305e-09,
"advantages/snr": 5.789693917499295e-09,
"advantages/std": 0.5227907299995422,
"advantages/var": 0.27331014737345427,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 4.729989327641409,
"grad_norm": 0.3268089437339258,
"learning_rate": 8.484575042110698e-09,
"loss": 0.0,
"num_tokens": 172479337.0,
"reward": 0.76171875,
"reward_std": 0.13835011422634125,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 1107
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1231638491478496e-09,
"advantages/std": 0.5483105778694153,
"advantages/var": 0.3006444898034921,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.734258271077908,
"grad_norm": 0.24190386381838827,
"learning_rate": 8.241100366064902e-09,
"loss": 0.0,
"num_tokens": 172634544.0,
"reward": 0.81640625,
"reward_std": 0.1545259803533554,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 1108
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 3.821694928466129e-09,
"advantages/std": 0.5483105778694153,
"advantages/var": 0.3006444898034921,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.7385272145144075,
"grad_norm": 0.26512176008869315,
"learning_rate": 8.001141100002884e-09,
"loss": -0.0,
"num_tokens": 172786862.0,
"reward": 0.81640625,
"reward_std": 0.1545259654521942,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 1109
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.016666532322234e-09,
"advantages/std": 0.5227798223495483,
"advantages/var": 0.2732987426558253,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.742796157950907,
"grad_norm": 0.30151062432531645,
"learning_rate": 7.764698959308313e-09,
"loss": 0.0,
"num_tokens": 172940709.0,
"reward": 0.81640625,
"reward_std": 0.1258624941110611,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"step": 1110
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.8894634138202136e-09,
"advantages/std": 0.3696773946285248,
"advantages/var": 0.13666137609933404,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.747065101387407,
"grad_norm": 0.15944846996665116,
"learning_rate": 7.531775634222137e-09,
"loss": -0.0,
"num_tokens": 173076717.0,
"reward": 0.8359375,
"reward_std": 0.07627260684967041,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"step": 1111
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 2.661563943316494e-09,
"advantages/std": 0.4373944103717804,
"advantages/var": 0.19131387022447743,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.751334044823906,
"grad_norm": 0.2350230283206485,
"learning_rate": 7.302372789830702e-09,
"loss": 0.0,
"num_tokens": 173224124.0,
"reward": 0.7265625,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 1112
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.5483001470565796,
"advantages/var": 0.3006330512622668,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.755602988260406,
"grad_norm": 0.2799137495336964,
"learning_rate": 7.076492066053486e-09,
"loss": 0.0,
"num_tokens": 173368075.0,
"reward": 0.76171875,
"reward_std": 0.14203834533691406,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 1113
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 3.4854364888597375e-09,
"advantages/std": 0.4676069915294647,
"advantages/var": 0.2186562985272369,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.759871931696905,
"grad_norm": 0.23008597518051577,
"learning_rate": 6.854135077631773e-09,
"loss": 0.0,
"num_tokens": 173517174.0,
"reward": 0.828125,
"reward_std": 0.11849337071180344,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 1114
},
{
"advantages/mean": -5.820766091346741e-09,
"advantages/snr": 9.765177591033836e-09,
"advantages/std": 0.5960737466812134,
"advantages/var": 0.35530391148257934,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.764140875133404,
"grad_norm": 0.2973781606816029,
"learning_rate": 6.6353034141168325e-09,
"loss": 0.0,
"num_tokens": 173677131.0,
"reward": 0.6953125,
"reward_std": 0.18056842684745789,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"step": 1115
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 1.0113837549730904e-08,
"advantages/std": 0.4373989701271057,
"advantages/var": 0.19131785906825272,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.796875,
"epoch": 4.7684098185699035,
"grad_norm": 0.20405016184291036,
"learning_rate": 6.419998639858537e-09,
"loss": -0.0,
"num_tokens": 173839238.0,
"reward": 0.671875,
"reward_std": 0.0966646745800972,
"rewards/drgrpo_math_reward/mean": 0.671875,
"rewards/drgrpo_math_reward/std": 0.47045037150382996,
"step": 1116
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.36966416239738464,
"advantages/var": 0.13665159296095997,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.772678762006404,
"grad_norm": 0.19423563422651047,
"learning_rate": 6.208222293994425e-09,
"loss": 0.0,
"num_tokens": 173969936.0,
"reward": 0.890625,
"reward_std": 0.06602286547422409,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"step": 1117
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.3360912852746437e-09,
"advantages/std": 0.5227875709533691,
"advantages/var": 0.273306844343324,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.776947705442903,
"grad_norm": 0.2424510021874874,
"learning_rate": 5.999975890438435e-09,
"loss": -0.0,
"num_tokens": 174132488.0,
"reward": 0.67578125,
"reward_std": 0.13546313345432281,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"step": 1118
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.336079555818232e-09,
"advantages/std": 0.5227921605110168,
"advantages/var": 0.2733116430917768,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.781216648879402,
"grad_norm": 0.36431189368295286,
"learning_rate": 5.795260917870359e-09,
"loss": 0.0,
"num_tokens": 174298944.0,
"reward": 0.66796875,
"reward_std": 0.13888297975063324,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"step": 1119
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.5969250913959587e-09,
"advantages/std": 0.4373980462551117,
"advantages/var": 0.19131705086778883,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 4.785485592315902,
"grad_norm": 0.2691325592767996,
"learning_rate": 5.594078839724792e-09,
"loss": -0.0,
"num_tokens": 174460048.0,
"reward": 0.71875,
"reward_std": 0.09719263762235641,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1120
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 6.1027632923521535e-09,
"advantages/std": 0.49597176909446716,
"advantages/var": 0.24598799573869545,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 4.789754535752401,
"grad_norm": 0.27735708577103635,
"learning_rate": 5.396431094181197e-09,
"loss": -0.0,
"num_tokens": 174608479.0,
"reward": 0.73046875,
"reward_std": 0.13413390517234802,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 1121
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.235163042765691e-09,
"advantages/std": 0.5227816700935364,
"advantages/var": 0.2733006745857871,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.794023479188901,
"grad_norm": 0.27998724553881144,
"learning_rate": 5.202319094153252e-09,
"loss": 0.0,
"num_tokens": 174767612.0,
"reward": 0.7109375,
"reward_std": 0.12863078713417053,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"step": 1122
},
{
"advantages/mean": 2.7939677238464355e-09,
"advantages/snr": 5.9749955192391796e-09,
"advantages/std": 0.4676100015640259,
"advantages/var": 0.21865911356270828,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.7982924226254005,
"grad_norm": 0.2236879992519063,
"learning_rate": 5.011744227278625e-09,
"loss": 0.0,
"num_tokens": 174924408.0,
"reward": 0.69140625,
"reward_std": 0.12243872880935669,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"step": 1123
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.698561914841724e-09,
"advantages/std": 0.5483006238937378,
"advantages/var": 0.3006335741622621,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 4.8025613660619,
"grad_norm": 0.28551119505496925,
"learning_rate": 4.824707855909605e-09,
"loss": -0.0,
"num_tokens": 175075666.0,
"reward": 0.71875,
"reward_std": 0.1442737877368927,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1124
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.4536934733273464e-10,
"advantages/std": 0.5227810144424438,
"advantages/var": 0.2732999890614707,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.806830309498399,
"grad_norm": 0.2896090207354788,
"learning_rate": 4.641211317102822e-09,
"loss": -0.0,
"num_tokens": 175218993.0,
"reward": 0.7734375,
"reward_std": 0.12756997346878052,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 1125
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.9752719972572655e-09,
"advantages/std": 0.4675883650779724,
"advantages/var": 0.2186388791562912,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.811099252934898,
"grad_norm": 0.2796216225903626,
"learning_rate": 4.461255922609985e-09,
"loss": 0.0,
"num_tokens": 175350083.0,
"reward": 0.87109375,
"reward_std": 0.10087842494249344,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"step": 1126
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.4896779409655654e-09,
"advantages/std": 0.46759188175201416,
"advantages/var": 0.2186421678803896,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.815368196371399,
"grad_norm": 0.23411543424994902,
"learning_rate": 4.284842958868329e-09,
"loss": -0.0,
"num_tokens": 175494985.0,
"reward": 0.76953125,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1127
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.477180109708889e-09,
"advantages/std": 0.46760138869285583,
"advantages/var": 0.21865105870748724,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.819637139807898,
"grad_norm": 0.28312151408955855,
"learning_rate": 4.111973686991676e-09,
"loss": 0.0,
"num_tokens": 175640384.0,
"reward": 0.76171875,
"reward_std": 0.11283808946609497,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"step": 1128
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 7.1258372493495695e-09,
"advantages/std": 0.5227863192558289,
"advantages/var": 0.2733055356010574,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.823906083244397,
"grad_norm": 0.2462476751541139,
"learning_rate": 3.9426493427611175e-09,
"loss": 0.0,
"num_tokens": 175784391.0,
"reward": 0.71875,
"reward_std": 0.1337556689977646,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1129
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.23379866778850555,
"advantages/var": 0.054661817059679985,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.8281750266808965,
"grad_norm": 0.17545168464173302,
"learning_rate": 3.776871136616289e-09,
"loss": -0.0,
"num_tokens": 175921949.0,
"reward": 0.79296875,
"reward_std": 0.02775236964225769,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 1130
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 7.812195974509691e-10,
"advantages/std": 0.5960696339607239,
"advantages/var": 0.35529900853007135,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.832443970117396,
"grad_norm": 0.322489514678151,
"learning_rate": 3.614640253646828e-09,
"loss": -0.0,
"num_tokens": 176092087.0,
"reward": 0.6796875,
"reward_std": 0.17544355988502502,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 1131
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 6.5454632784124295e-09,
"advantages/std": 0.6402834057807922,
"advantages/var": 0.40996283971825065,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.836712913553895,
"grad_norm": 0.3875750637098193,
"learning_rate": 3.4559578535837685e-09,
"loss": 0.0,
"num_tokens": 176254779.0,
"reward": 0.6328125,
"reward_std": 0.20437544584274292,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 1132
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.694570397962185e-10,
"advantages/std": 0.49595728516578674,
"advantages/var": 0.2459736287090175,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.840981856990394,
"grad_norm": 0.3219642366491413,
"learning_rate": 3.3008250707913242e-09,
"loss": 0.0,
"num_tokens": 176420160.0,
"reward": 0.6875,
"reward_std": 0.11928972601890564,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"step": 1133
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.8747867624157402e-09,
"advantages/std": 0.40495288372039795,
"advantages/var": 0.16398683803346614,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.78125,
"epoch": 4.845250800426895,
"grad_norm": 0.22062801731458345,
"learning_rate": 3.14924301425884e-09,
"loss": -0.0,
"num_tokens": 176576177.0,
"reward": 0.703125,
"reward_std": 0.08337578922510147,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 1134
},
{
"advantages/mean": 4.423782229423523e-09,
"advantages/snr": 9.460670050669734e-09,
"advantages/std": 0.4675971269607544,
"advantages/var": 0.21864707314195186,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.849519743863394,
"grad_norm": 0.21905543535634842,
"learning_rate": 3.00121276759252e-09,
"loss": -0.0,
"num_tokens": 176716311.0,
"reward": 0.79296875,
"reward_std": 0.10889027267694473,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"step": 1135
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 1.991768376879134e-09,
"advantages/std": 0.4675857722759247,
"advantages/var": 0.2186364544348729,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.853788687299893,
"grad_norm": 0.2494609999273987,
"learning_rate": 2.856735389008269e-09,
"loss": -0.0,
"num_tokens": 176874861.0,
"reward": 0.65234375,
"reward_std": 0.09916849434375763,
"rewards/drgrpo_math_reward/mean": 0.65234375,
"rewards/drgrpo_math_reward/std": 0.4771590530872345,
"step": 1136
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.149914704966296e-09,
"advantages/std": 0.40495288372039795,
"advantages/var": 0.16398683803346614,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.8580576307363925,
"grad_norm": 0.23420297830571496,
"learning_rate": 2.7158119113234732e-09,
"loss": -0.0,
"num_tokens": 177010122.0,
"reward": 0.7734375,
"reward_std": 0.08337578922510147,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 1137
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.024806323518198e-09,
"advantages/std": 0.4049423336982727,
"advantages/var": 0.16397829362100325,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 4.862326574172892,
"grad_norm": 0.30097179874734115,
"learning_rate": 2.578443341950176e-09,
"loss": 0.0,
"num_tokens": 177165560.0,
"reward": 0.70703125,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 1138
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.504941560504481e-09,
"advantages/std": 0.5726861953735352,
"advantages/var": 0.3279694783714149,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.866595517609392,
"grad_norm": 0.30906655932357985,
"learning_rate": 2.4446306628875813e-09,
"loss": 0.0,
"num_tokens": 177325268.0,
"reward": 0.65625,
"reward_std": 0.1626875400543213,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"step": 1139
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 2.8167369917440553e-09,
"advantages/std": 0.49595820903778076,
"advantages/var": 0.24597454511196304,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.671875,
"epoch": 4.870864461045891,
"grad_norm": 0.25293915422456575,
"learning_rate": 2.31437483071506e-09,
"loss": -0.0,
"num_tokens": 177501239.0,
"reward": 0.5546875,
"reward_std": 0.12046678364276886,
"rewards/drgrpo_math_reward/mean": 0.5546875,
"rewards/drgrpo_math_reward/std": 0.49797385931015015,
"step": 1140
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 5.0956275813194834e-09,
"advantages/std": 0.5483068823814392,
"advantages/var": 0.3006404372668534,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.875133404482391,
"grad_norm": 0.27334241170458684,
"learning_rate": 2.1876767765853233e-09,
"loss": 0.0,
"num_tokens": 177657926.0,
"reward": 0.70703125,
"reward_std": 0.15057817101478577,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 1141
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.2739222670442577e-09,
"advantages/std": 0.5483002662658691,
"advantages/var": 0.300633181987223,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.87940234791889,
"grad_norm": 0.2710624584587459,
"learning_rate": 2.0645374062179253e-09,
"loss": 0.0,
"num_tokens": 177795708.0,
"reward": 0.80078125,
"reward_std": 0.1437433660030365,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"step": 1142
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 3.5628544429995396e-09,
"advantages/std": 0.5227957367897034,
"advantages/var": 0.2733153824054888,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.883671291355389,
"grad_norm": 0.2565490653909916,
"learning_rate": 1.9449575998924383e-09,
"loss": 0.0,
"num_tokens": 177938848.0,
"reward": 0.828125,
"reward_std": 0.14400538802146912,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 1143
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 4.97924068182451e-10,
"advantages/std": 0.4676026999950409,
"advantages/var": 0.21865228504265222,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.765625,
"epoch": 4.887940234791889,
"grad_norm": 0.22628308527208368,
"learning_rate": 1.8289382124426211e-09,
"loss": 0.0,
"num_tokens": 178102728.0,
"reward": 0.6328125,
"reward_std": 0.1145455539226532,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"step": 1144
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 2.1232125491970075e-09,
"advantages/std": 0.5482980012893677,
"advantages/var": 0.30063069821791544,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.892209178228389,
"grad_norm": 0.2379483920272022,
"learning_rate": 1.7164800732498154e-09,
"loss": -0.0,
"num_tokens": 178254574.0,
"reward": 0.76953125,
"reward_std": 0.14032843708992004,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1145
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 4.878675704298026e-09,
"advantages/std": 0.5726897716522217,
"advantages/var": 0.3279735745550738,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 4.896478121664888,
"grad_norm": 0.35643188938727866,
"learning_rate": 1.6075839862374486e-09,
"loss": 0.0,
"num_tokens": 178401659.0,
"reward": 0.7578125,
"reward_std": 0.16834037005901337,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"step": 1146
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.97100182115522e-09,
"advantages/std": 0.4675983488559723,
"advantages/var": 0.21864821585283156,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.96875,
"epoch": 4.900747065101387,
"grad_norm": 0.24224725482315074,
"learning_rate": 1.5022507298649845e-09,
"loss": 0.0,
"num_tokens": 178553534.0,
"reward": 0.71875,
"reward_std": 0.10889272391796112,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1147
},
{
"advantages/mean": -2.0954757928848267e-09,
"advantages/snr": 6.337630938135611e-09,
"advantages/std": 0.3306402266025543,
"advantages/var": 0.10932295944778847,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.8125,
"epoch": 4.905016008537887,
"grad_norm": 0.1679248225157117,
"learning_rate": 1.4004810571225378e-09,
"loss": 0.0,
"num_tokens": 178707051.0,
"reward": 0.7265625,
"reward_std": 0.05444391071796417,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 1148
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 1.4083695114256918e-09,
"advantages/std": 0.4959578514099121,
"advantages/var": 0.24597419037513646,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.84375,
"epoch": 4.909284951974386,
"grad_norm": 0.21289099107193096,
"learning_rate": 1.30227569552549e-09,
"loss": -0.0,
"num_tokens": 178858493.0,
"reward": 0.73046875,
"reward_std": 0.11993636190891266,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"step": 1149
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.779016395729458e-09,
"advantages/std": 0.3696686327457428,
"advantages/var": 0.13665489803610686,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.913553895410886,
"grad_norm": 0.21853704015214817,
"learning_rate": 1.2076353471089927e-09,
"loss": 0.0,
"num_tokens": 178990574.0,
"reward": 0.921875,
"reward_std": 0.06890985369682312,
"rewards/drgrpo_math_reward/mean": 0.921875,
"rewards/drgrpo_math_reward/std": 0.26889389753341675,
"step": 1150
},
{
"advantages/mean": -4.423782229423523e-09,
"advantages/snr": 1.0114006384407448e-08,
"advantages/std": 0.4373916685581207,
"advantages/var": 0.19131147172405694,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.9178228388473855,
"grad_norm": 0.23268145856156358,
"learning_rate": 1.116560688423418e-09,
"loss": 0.0,
"num_tokens": 179147929.0,
"reward": 0.6796875,
"reward_std": 0.09100693464279175,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 1151
},
{
"advantages/mean": -3.259629011154175e-09,
"advantages/snr": 6.970936510266691e-09,
"advantages/std": 0.4676027297973633,
"advantages/var": 0.21865231291394593,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.922091782283885,
"grad_norm": 0.2005730784728622,
"learning_rate": 1.0290523705291932e-09,
"loss": 0.0,
"num_tokens": 179308414.0,
"reward": 0.703125,
"reward_std": 0.1145455539226532,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"step": 1152
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 4.694553754175022e-09,
"advantages/std": 0.4959590435028076,
"advantages/var": 0.24597537283221982,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.9375,
"epoch": 4.926360725720384,
"grad_norm": 0.29268173253105517,
"learning_rate": 9.45111018992306e-10,
"loss": 0.0,
"num_tokens": 179442130.0,
"reward": 0.828125,
"reward_std": 0.11993882060050964,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"step": 1153
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 1.7814881544564379e-09,
"advantages/std": 0.5227778553962708,
"advantages/var": 0.2732966860927242,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.90625,
"epoch": 4.930629669156883,
"grad_norm": 0.2273045762887075,
"learning_rate": 8.647372338795866e-10,
"loss": -0.0,
"num_tokens": 179593784.0,
"reward": 0.71875,
"reward_std": 0.12468298524618149,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1154
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.1175086559963265e-09,
"advantages/std": 0.5227938890457153,
"advantages/var": 0.2733134504235437,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.934898612593383,
"grad_norm": 0.2420054917145749,
"learning_rate": 7.8793158975482e-10,
"loss": -0.0,
"num_tokens": 179753798.0,
"reward": 0.70703125,
"reward_std": 0.1412370800971985,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"step": 1155
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 6.9283207245813425e-09,
"advantages/std": 0.36966201663017273,
"advantages/var": 0.1366500065390861,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.939167556029883,
"grad_norm": 0.17994478010217374,
"learning_rate": 7.146946356743067e-10,
"loss": 0.0,
"num_tokens": 179901476.0,
"reward": 0.6640625,
"reward_std": 0.06378498673439026,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 1156
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 4.0655376988318266e-10,
"advantages/std": 0.5726933479309082,
"advantages/var": 0.3279776707643123,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.828125,
"epoch": 4.943436499466382,
"grad_norm": 0.3042820187636512,
"learning_rate": 6.450268951830318e-10,
"loss": 0.0,
"num_tokens": 180062992.0,
"reward": 0.6640625,
"reward_std": 0.17069938778877258,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"step": 1157
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 3.2861526764651336e-09,
"advantages/std": 0.49596431851387024,
"advantages/var": 0.24598060523892773,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.734375,
"epoch": 4.9477054429028815,
"grad_norm": 0.23530944447092877,
"learning_rate": 5.789288663110014e-10,
"loss": 0.0,
"num_tokens": 180224794.0,
"reward": 0.72265625,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"step": 1158
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 8.016719545759423e-09,
"advantages/std": 0.5227763652801514,
"advantages/var": 0.27329512809552625,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.951974386339381,
"grad_norm": 0.298315077604583,
"learning_rate": 5.164010215695791e-10,
"loss": 0.0,
"num_tokens": 180364639.0,
"reward": 0.7734375,
"reward_std": 0.12244509160518646,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 1159
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 3.755670987579601e-09,
"advantages/std": 0.49595534801483154,
"advantages/var": 0.24597170722451267,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.95624332977588,
"grad_norm": 0.2191656669611469,
"learning_rate": 4.574438079480991e-10,
"loss": 0.0,
"num_tokens": 180506762.0,
"reward": 0.82421875,
"reward_std": 0.11652141809463501,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"step": 1160
},
{
"advantages/mean": -5.587935447692871e-09,
"advantages/snr": 1.1950243137431801e-08,
"advantages/std": 0.46760013699531555,
"advantages/var": 0.21864988811803787,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.921875,
"epoch": 4.96051227321238,
"grad_norm": 0.18746099127830204,
"learning_rate": 4.020576469108139e-10,
"loss": 0.0,
"num_tokens": 180642341.0,
"reward": 0.8671875,
"reward_std": 0.11283563077449799,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"step": 1161
},
{
"advantages/mean": 3.259629011154175e-09,
"advantages/snr": 5.691755147905178e-09,
"advantages/std": 0.5726931095123291,
"advantages/var": 0.3279773976829006,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.96478121664888,
"grad_norm": 0.2954386105161857,
"learning_rate": 3.5024293439372967e-10,
"loss": -0.0,
"num_tokens": 180813205.0,
"reward": 0.66015625,
"reward_std": 0.17187398672103882,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"step": 1162
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 4.694550651109113e-09,
"advantages/std": 0.4959593713283539,
"advantages/var": 0.245975698008416,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.969050160085379,
"grad_norm": 0.24627764577496902,
"learning_rate": 3.020000408018863e-10,
"loss": 0.0,
"num_tokens": 180953416.0,
"reward": 0.80859375,
"reward_std": 0.12046922743320465,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"step": 1163
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 5.855392400745098e-09,
"advantages/std": 0.4373980164527893,
"advantages/var": 0.19131702479683454,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.953125,
"epoch": 4.973319103521878,
"grad_norm": 0.19705677177548778,
"learning_rate": 2.573293110065822e-10,
"loss": 0.0,
"num_tokens": 181091777.0,
"reward": 0.7265625,
"reward_std": 0.09719263762235641,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"step": 1164
},
{
"advantages/mean": -3.958120942115784e-09,
"advantages/snr": 7.571227974076312e-09,
"advantages/std": 0.5227845311164856,
"advantages/var": 0.2733036659746837,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.984375,
"epoch": 4.9775880469583775,
"grad_norm": 0.31218340104221065,
"learning_rate": 2.1623106434309757e-10,
"loss": 0.0,
"num_tokens": 181237625.0,
"reward": 0.76953125,
"reward_std": 0.13098736107349396,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"step": 1165
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 4.224992848592354e-09,
"advantages/std": 0.33064761757850647,
"advantages/var": 0.10932784701034226,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 4.981856990394878,
"grad_norm": 0.15329749593412742,
"learning_rate": 1.787055946081417e-10,
"loss": -0.0,
"num_tokens": 181372123.0,
"reward": 0.7734375,
"reward_std": 0.059568777680397034,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"step": 1166
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 2.972431973776251e-09,
"advantages/std": 0.5483101010322571,
"advantages/var": 0.30064396689400397,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.986125933831377,
"grad_norm": 0.22789400713755284,
"learning_rate": 1.4475317005802067e-10,
"loss": -0.0,
"num_tokens": 181532071.0,
"reward": 0.6796875,
"reward_std": 0.15558436512947083,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"step": 1167
},
{
"advantages/mean": -3.725290298461914e-09,
"advantages/snr": 6.505066136085465e-09,
"advantages/std": 0.5726752281188965,
"advantages/var": 0.3279569169010301,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.890625,
"epoch": 4.990394877267876,
"grad_norm": 0.2972979244184956,
"learning_rate": 1.1437403340652796e-10,
"loss": 0.0,
"num_tokens": 181682041.0,
"reward": 0.69921875,
"reward_std": 0.15019746124744415,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"step": 1168
},
{
"advantages/mean": -4.190951585769653e-09,
"advantages/snr": 1.1337001660336367e-08,
"advantages/std": 0.36967018246650696,
"advantages/var": 0.13665604380482055,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.859375,
"epoch": 4.994663820704376,
"grad_norm": 0.2458655812447713,
"learning_rate": 8.756840182344571e-11,
"loss": 0.0,
"num_tokens": 181810281.0,
"reward": 0.87109375,
"reward_std": 0.06891229748725891,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"step": 1169
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 7.528047335495492e-10,
"advantages/std": 0.618568480014801,
"advantages/var": 0.3826269644678213,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -2.875,
"epoch": 4.998932764140875,
"grad_norm": 0.32316307035032915,
"learning_rate": 6.433646693265737e-11,
"loss": 0.0,
"num_tokens": 181970221.0,
"reward": 0.71875,
"reward_std": 0.18596169352531433,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"step": 1170
}
],
"logging_steps": 1,
"max_steps": 1175,
"num_input_tokens_seen": 181970221,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}