{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998932764140875, "eval_steps": 500, "global_step": 1170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6185625195503235, "advantages/var": 0.3826195905924443, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 0.004268943436499467, "grad_norm": 0.2276165733924732, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 183035.0, "reward": 0.5859375, "reward_std": 0.17742186784744263, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.4935242533683777, "step": 1 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2525086942753754e-09, "advantages/std": 0.5726795196533203, "advantages/var": 0.3279618322303577, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.008537886872998933, "grad_norm": 0.226357800614887, "learning_rate": 9.999982128386562e-07, "loss": -0.0, "num_tokens": 361873.0, "reward": 0.56640625, "reward_std": 0.1553223431110382, "rewards/drgrpo_math_reward/mean": 0.56640625, "rewards/drgrpo_math_reward/std": 0.4965413510799408, "step": 2 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 7.4215683647132815e-09, "advantages/std": 0.5960710644721985, "advantages/var": 0.3553007139010198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.012806830309498399, "grad_norm": 0.19563640777871955, "learning_rate": 9.999928513674003e-07, "loss": 0.0, "num_tokens": 552736.0, "reward": 0.5390625, "reward_std": 0.17609265446662903, "rewards/drgrpo_math_reward/mean": 0.5390625, "rewards/drgrpo_math_reward/std": 0.4994482398033142, "step": 3 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.906189388529494e-09, "advantages/std": 0.5960556864738464, "advantages/var": 0.3552823813778083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.017075773745997867, "grad_norm": 0.22884223901736953, "learning_rate": 9.999839156245597e-07, "loss": 0.0, "num_tokens": 715125.0, "reward": 0.63671875, "reward_std": 0.1573006510734558, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 4 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.520896926640372e-09, "advantages/std": 0.6612821817398071, "advantages/var": 0.4372941238865593, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.021344717182497332, "grad_norm": 0.22527385543570674, "learning_rate": 9.999714056740128e-07, "loss": -0.0, "num_tokens": 889019.0, "reward": 0.7265625, "reward_std": 0.21778054535388947, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 5 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.171810413586495e-09, "advantages/std": 0.5960792899131775, "advantages/var": 0.3553105198633979, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.390625, "epoch": 0.025613660618996798, "grad_norm": 0.2126144087440947, "learning_rate": 9.99955321605189e-07, "loss": 0.0, "num_tokens": 1074157.0, "reward": 0.5, "reward_std": 0.18634238839149475, "rewards/drgrpo_math_reward/mean": 0.5, "rewards/drgrpo_math_reward/std": 0.5009794235229492, "step": 6 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065588475315301e-09, "advantages/std": 0.5726861953735352, "advantages/var": 0.3279694783714149, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 0.029882604055496264, "grad_norm": 0.1897445781841252, "learning_rate": 9.999356635330673e-07, "loss": -0.0, "num_tokens": 1234275.0, "reward": 0.7265625, "reward_std": 0.1626875400543213, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 7 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 4.5771540403460145e-09, "advantages/std": 0.6612839102745056, "advantages/var": 0.4372964099879404, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.03415154749199573, "grad_norm": 0.23483356980828468, "learning_rate": 9.999124315981764e-07, "loss": 0.0, "num_tokens": 1398228.0, "reward": 0.671875, "reward_std": 0.2210792601108551, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 8 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 7.99997790566197e-09, "advantages/std": 0.6402860283851624, "advantages/var": 0.40996619814524493, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.296875, "epoch": 0.0384204909284952, "grad_norm": 0.3110548918586148, "learning_rate": 9.998856259665933e-07, "loss": -0.0, "num_tokens": 1578412.0, "reward": 0.5625, "reward_std": 0.20726242661476135, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49705013632774353, "step": 9 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 8.917455869309804e-09, "advantages/std": 0.5483002662658691, "advantages/var": 0.300633181987223, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.042689434364994665, "grad_norm": 1.5465370680042207, "learning_rate": 9.99855246829942e-07, "loss": 0.0, "num_tokens": 1733286.0, "reward": 0.76171875, "reward_std": 0.1437433660030365, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 10 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.516702186066849e-09, "advantages/std": 0.6185857653617859, "advantages/var": 0.3826483491082264, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "epoch": 0.04695837780149413, "grad_norm": 0.2519428429352001, "learning_rate": 9.998212944053918e-07, "loss": 0.0, "num_tokens": 1918742.0, "reward": 0.51171875, "reward_std": 0.20752444863319397, "rewards/drgrpo_math_reward/mean": 0.51171875, "rewards/drgrpo_math_reward/std": 0.5008418560028076, "step": 11 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.337529925466492e-09, "advantages/std": 0.6612910032272339, "advantages/var": 0.43730579094928146, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.051227321237993596, "grad_norm": 0.2167273292177543, "learning_rate": 9.997837689356569e-07, "loss": 0.0, "num_tokens": 2090431.0, "reward": 0.65234375, "reward_std": 0.23079858720302582, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 12 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 1.0163927816352007e-08, "advantages/std": 0.572688639163971, "advantages/var": 0.3279722774274809, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 0.05549626467449306, "grad_norm": 0.2485293884209483, "learning_rate": 9.997426706889933e-07, "loss": 0.0, "num_tokens": 2244232.0, "reward": 0.64453125, "reward_std": 0.16663289070129395, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 13 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814483444527136e-09, "advantages/std": 0.5227895379066467, "advantages/var": 0.2733089009446452, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.05976520811099253, "grad_norm": 0.20260901830215677, "learning_rate": 9.99697999959198e-07, "loss": -0.0, "num_tokens": 2400663.0, "reward": 0.7109375, "reward_std": 0.13664263486862183, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 14 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.6363342984850635e-09, "advantages/std": 0.6402894258499146, "advantages/var": 0.4099705488552132, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 0.064034151547492, "grad_norm": 0.25104401322218795, "learning_rate": 9.996497570656062e-07, "loss": 0.0, "num_tokens": 2578690.0, "reward": 0.63671875, "reward_std": 0.21344566345214844, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 15 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.123630759509537e-09, "advantages/std": 0.6816376447677612, "advantages/var": 0.46462987876454065, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "epoch": 0.06830309498399147, "grad_norm": 0.2337666130483442, "learning_rate": 9.995979423530892e-07, "loss": 0.0, "num_tokens": 2759120.0, "reward": 0.640625, "reward_std": 0.23778307437896729, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 16 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.149247399126245e-10, "advantages/std": 0.7393215298652649, "advantages/var": 0.5465963245223158, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 0.07257203842049093, "grad_norm": 0.3111236732518641, "learning_rate": 9.99542556192052e-07, "loss": 0.0, "num_tokens": 2941175.0, "reward": 0.58984375, "reward_std": 0.251722514629364, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 17 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.906080019985982e-10, "advantages/std": 0.5960723757743835, "advantages/var": 0.3553022771613179, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.0768409818569904, "grad_norm": 0.19984891954596115, "learning_rate": 9.994835989784303e-07, "loss": 0.0, "num_tokens": 3108867.0, "reward": 0.6640625, "reward_std": 0.17833054065704346, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 18 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.859250099670972e-09, "advantages/std": 0.5960591435432434, "advantages/var": 0.35528650260150485, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 0.08110992529348986, "grad_norm": 0.18501721563546608, "learning_rate": 9.99421071133689e-07, "loss": 0.0, "num_tokens": 3286939.0, "reward": 0.61328125, "reward_std": 0.1629534810781479, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 19 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.520419240832458e-09, "advantages/std": 0.5482913851737976, "advantages/var": 0.3006234430558017, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.484375, "epoch": 0.08537886872998933, "grad_norm": 0.17419719340312353, "learning_rate": 9.993549731048169e-07, "loss": 0.0, "num_tokens": 3454477.0, "reward": 0.66015625, "reward_std": 0.13349363207817078, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 20 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8819951499331522e-09, "advantages/std": 0.6185739636421204, "advantages/var": 0.38263374849592324, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.08964781216648879, "grad_norm": 0.27585718695020495, "learning_rate": 9.992853053643257e-07, "loss": -0.0, "num_tokens": 3623839.0, "reward": 0.55078125, "reward_std": 0.19385483860969543, "rewards/drgrpo_math_reward/mean": 0.55078125, "rewards/drgrpo_math_reward/std": 0.49838894605636597, "step": 21 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691813202266692e-09, "advantages/std": 0.5726872682571411, "advantages/var": 0.3279707072238267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "epoch": 0.09391675560298826, "grad_norm": 0.22555380621343457, "learning_rate": 9.992120684102452e-07, "loss": -0.0, "num_tokens": 3800102.0, "reward": 0.63671875, "reward_std": 0.16439500451087952, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 22 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.363669808468423e-09, "advantages/std": 0.6402793526649475, "advantages/var": 0.4099576494490442, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.09818569903948772, "grad_norm": 0.255857672328258, "learning_rate": 9.991352627661204e-07, "loss": -0.0, "num_tokens": 3951987.0, "reward": 0.73046875, "reward_std": 0.19872015714645386, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 23 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 8.068186637020806e-09, "advantages/std": 0.5482994318008423, "advantages/var": 0.3006322669131265, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.359375, "epoch": 0.10245464247598719, "grad_norm": 0.21824678675243928, "learning_rate": 9.990548889810077e-07, "loss": 0.0, "num_tokens": 4134083.0, "reward": 0.61328125, "reward_std": 0.14256632328033447, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 24 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.7326250551152195e-09, "advantages/std": 0.6816321611404419, "advantages/var": 0.46462240310098935, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.10672358591248667, "grad_norm": 0.24711610053127914, "learning_rate": 9.989709476294707e-07, "loss": -0.0, "num_tokens": 4300726.0, "reward": 0.62890625, "reward_std": 0.22882908582687378, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 25 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 7.151522306503461e-09, "advantages/std": 0.618579089641571, "advantages/var": 0.3826400901417948, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 0.11099252934898612, "grad_norm": 0.2287106287368787, "learning_rate": 9.988834393115767e-07, "loss": 0.0, "num_tokens": 4475968.0, "reward": 0.609375, "reward_std": 0.1990984082221985, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 26 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.225062602237694e-09, "advantages/std": 0.661284327507019, "advantages/var": 0.4372969618064104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.1152614727854856, "grad_norm": 0.28581877552254065, "learning_rate": 9.98792364652891e-07, "loss": 0.0, "num_tokens": 4639754.0, "reward": 0.66796875, "reward_std": 0.22172591090202332, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 27 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.786420986722844e-09, "advantages/std": 0.6185710430145264, "advantages/var": 0.38263013525607903, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 0.11953041622198506, "grad_norm": 0.24270294378064936, "learning_rate": 9.986977243044745e-07, "loss": 0.0, "num_tokens": 4770384.0, "reward": 0.796875, "reward_std": 0.18714365363121033, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 28 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 3.983451559204775e-09, "advantages/std": 0.7013936638832092, "advantages/var": 0.4919530717355123, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 0.12379935965848453, "grad_norm": 0.25235289880629225, "learning_rate": 9.985995189428775e-07, "loss": 0.0, "num_tokens": 4949006.0, "reward": 0.5703125, "reward_std": 0.24488136172294617, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4960011839866638, "step": 29 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.763953668416837e-09, "advantages/std": 0.6185799837112427, "advantages/var": 0.38264119624820125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 0.128068303094984, "grad_norm": 0.26694114427436527, "learning_rate": 9.98497749270135e-07, "loss": -0.0, "num_tokens": 5103697.0, "reward": 0.69140625, "reward_std": 0.19910085201263428, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 30 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5055998919439133e-09, "advantages/std": 0.6185724139213562, "advantages/var": 0.3826318312644936, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.13233724653148346, "grad_norm": 0.19746607159558405, "learning_rate": 9.983924160137624e-07, "loss": -0.0, "num_tokens": 5283289.0, "reward": 0.609375, "reward_std": 0.191086545586586, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 31 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.520887723343088e-10, "advantages/std": 0.6612839102745056, "advantages/var": 0.4372964099879404, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.13660618996798293, "grad_norm": 0.2770381026875717, "learning_rate": 9.9828351992675e-07, "loss": 0.0, "num_tokens": 5449062.0, "reward": 0.6640625, "reward_std": 0.2210792601108551, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 32 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.163983994864379e-09, "advantages/std": 0.495961457490921, "advantages/var": 0.24597776731651866, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.14087513340448238, "grad_norm": 0.22644008687355208, "learning_rate": 9.981710617875575e-07, "loss": 0.0, "num_tokens": 5610253.0, "reward": 0.6640625, "reward_std": 0.12164874374866486, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 33 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.687293680438322e-09, "advantages/std": 0.5960726737976074, "advantages/var": 0.3553026324482289, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.14514407684098185, "grad_norm": 0.19541733821886728, "learning_rate": 9.980550424001074e-07, "loss": 0.0, "num_tokens": 5800728.0, "reward": 0.59765625, "reward_std": 0.17886094748973846, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4913311004638672, "step": 34 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.812128792816569e-10, "advantages/std": 0.5960747599601746, "advantages/var": 0.3553051194615797, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.14941302027748132, "grad_norm": 0.24645692096714833, "learning_rate": 9.97935462593782e-07, "loss": 0.0, "num_tokens": 5971332.0, "reward": 0.58984375, "reward_std": 0.1822758913040161, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 35 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.272959047519299e-10, "advantages/std": 0.6402638554573059, "advantages/var": 0.4099378046050539, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.1536819637139808, "grad_norm": 0.27255815392539734, "learning_rate": 9.978123232234146e-07, "loss": -0.0, "num_tokens": 6133393.0, "reward": 0.59375, "reward_std": 0.17939528822898865, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.49209436774253845, "step": 36 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.659004112460079e-09, "advantages/std": 0.5726901888847351, "advantages/var": 0.3279740524448336, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.15795090715048027, "grad_norm": 0.2428885465959258, "learning_rate": 9.976856251692849e-07, "loss": -0.0, "num_tokens": 6306233.0, "reward": 0.62109375, "reward_std": 0.16728198528289795, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 37 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.4536934733273464e-10, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 0.1622198505869797, "grad_norm": 0.19391049030055343, "learning_rate": 9.975553693371123e-07, "loss": 0.0, "num_tokens": 6472899.0, "reward": 0.640625, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 38 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.140450770645173e-09, "advantages/std": 0.618564784526825, "advantages/var": 0.3826223926567174, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 0.1664887940234792, "grad_norm": 0.2458054922523459, "learning_rate": 9.974215566580498e-07, "loss": 0.0, "num_tokens": 6635464.0, "reward": 0.69140625, "reward_std": 0.18136723339557648, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 39 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.48969158764971e-09, "advantages/std": 0.4675893187522888, "advantages/var": 0.21863977101122956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 0.17075773745997866, "grad_norm": 0.1921600026379778, "learning_rate": 9.972841880886765e-07, "loss": 0.0, "num_tokens": 6805950.0, "reward": 0.56640625, "reward_std": 0.10205548256635666, "rewards/drgrpo_math_reward/mean": 0.56640625, "rewards/drgrpo_math_reward/std": 0.4965413510799408, "step": 40 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5483123064041138, "advantages/var": 0.30064638535419874, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.17502668089647813, "grad_norm": 0.1966993448153858, "learning_rate": 9.971432646109917e-07, "loss": -0.0, "num_tokens": 6974438.0, "reward": 0.6953125, "reward_std": 0.15729428827762604, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 41 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.2268175393589625e-09, "advantages/std": 0.522787868976593, "advantages/var": 0.2733071559490874, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "epoch": 0.17929562433297758, "grad_norm": 0.17565374090970223, "learning_rate": 9.969987872324075e-07, "loss": 0.0, "num_tokens": 7147608.0, "reward": 0.6328125, "reward_std": 0.13599355518817902, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 42 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.1249027630968986e-09, "advantages/std": 0.5960649847984314, "advantages/var": 0.35529346610275425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "epoch": 0.18356456776947705, "grad_norm": 0.20919270586057445, "learning_rate": 9.968507569857412e-07, "loss": -0.0, "num_tokens": 7331808.0, "reward": 0.578125, "reward_std": 0.16925784945487976, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49482619762420654, "step": 43 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439349784680999e-09, "advantages/std": 0.5726869702339172, "advantages/var": 0.3279703658757036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.18783351120597652, "grad_norm": 0.22710593855397082, "learning_rate": 9.966991749292086e-07, "loss": -0.0, "num_tokens": 7494201.0, "reward": 0.6328125, "reward_std": 0.16386458277702332, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 44 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5624291957442206e-09, "advantages/std": 0.5960734486579895, "advantages/var": 0.35530355619502885, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 0.192102454642476, "grad_norm": 0.1873875778423085, "learning_rate": 9.965440421464162e-07, "loss": 0.0, "num_tokens": 7668016.0, "reward": 0.68359375, "reward_std": 0.18003800511360168, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 45 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.672136567095384e-09, "advantages/std": 0.5227965712547302, "advantages/var": 0.2733162549157022, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.19637139807897544, "grad_norm": 0.19598414326761007, "learning_rate": 9.963853597463532e-07, "loss": -0.0, "num_tokens": 7831746.0, "reward": 0.6640625, "reward_std": 0.14518246054649353, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 46 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8820033105193726e-09, "advantages/std": 0.6185712814331055, "advantages/var": 0.38263043021379417, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.2006403415154749, "grad_norm": 0.23495355894033243, "learning_rate": 9.962231288633838e-07, "loss": 0.0, "num_tokens": 8005499.0, "reward": 0.66796875, "reward_std": 0.1909678727388382, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 47 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.388929790548923e-10, "advantages/std": 0.49596843123435974, "advantages/var": 0.24598468478107183, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.484375, "epoch": 0.20490928495197439, "grad_norm": 0.17028775334740143, "learning_rate": 9.960573506572389e-07, "loss": 0.0, "num_tokens": 8177309.0, "reward": 0.58984375, "reward_std": 0.1312469244003296, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 48 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.520271014273453e-09, "advantages/std": 0.5483061075210571, "advantages/var": 0.30063958754489306, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.20917822838847386, "grad_norm": 0.21939679418814842, "learning_rate": 9.958880263130084e-07, "loss": -0.0, "num_tokens": 8334953.0, "reward": 0.73046875, "reward_std": 0.14940111339092255, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 49 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.87778765062976e-09, "advantages/std": 0.4959679841995239, "advantages/var": 0.24598424135093921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.21344717182497333, "grad_norm": 0.17905287281997653, "learning_rate": 9.957151570411316e-07, "loss": -0.0, "num_tokens": 8498503.0, "reward": 0.6484375, "reward_std": 0.12901148200035095, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 50 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.7555886158092e-09, "advantages/std": 0.49596622586250305, "advantages/var": 0.2459824971962954, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.21771611526147278, "grad_norm": 0.19002154099000607, "learning_rate": 9.9553874407739e-07, "loss": -0.0, "num_tokens": 8642045.0, "reward": 0.6484375, "reward_std": 0.12836240231990814, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 51 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.7342560139901876e-09, "advantages/std": 0.5960723757743835, "advantages/var": 0.3553022771613179, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.22198505869797225, "grad_norm": 0.23688167107288094, "learning_rate": 9.95358788682897e-07, "loss": -0.0, "num_tokens": 8833114.0, "reward": 0.546875, "reward_std": 0.17833054065704346, "rewards/drgrpo_math_reward/mean": 0.546875, "rewards/drgrpo_math_reward/std": 0.4987730085849762, "step": 52 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 9.858661223793146e-09, "advantages/std": 0.49595409631729126, "advantages/var": 0.24597046565390102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 0.22625400213447172, "grad_norm": 0.21147731024352381, "learning_rate": 9.951752921440904e-07, "loss": 0.0, "num_tokens": 8968225.0, "reward": 0.8203125, "reward_std": 0.11481395363807678, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 53 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.1231940844604465e-09, "advantages/std": 0.5483027696609497, "advantages/var": 0.30063592721786847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.2305229455709712, "grad_norm": 0.19216694044272573, "learning_rate": 9.949882557727213e-07, "loss": -0.0, "num_tokens": 9128708.0, "reward": 0.7421875, "reward_std": 0.1459837108850479, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 54 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524406527748425e-09, "advantages/std": 0.5726915001869202, "advantages/var": 0.3279755543863452, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.23479188900747064, "grad_norm": 0.1931123684231175, "learning_rate": 9.947976809058467e-07, "loss": -0.0, "num_tokens": 9278853.0, "reward": 0.80859375, "reward_std": 0.16951987147331238, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 55 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.626250454143894e-09, "advantages/std": 0.5726808905601501, "advantages/var": 0.32796340241276667, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 0.2390608324439701, "grad_norm": 0.22436070190282476, "learning_rate": 9.946035689058187e-07, "loss": 0.0, "num_tokens": 9437868.0, "reward": 0.69921875, "reward_std": 0.15756022930145264, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 56 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.14039371842048e-09, "advantages/std": 0.6185733079910278, "advantages/var": 0.382632937358963, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "epoch": 0.24332977588046958, "grad_norm": 0.22106574157254638, "learning_rate": 9.94405921160275e-07, "loss": -0.0, "num_tokens": 9624456.0, "reward": 0.50390625, "reward_std": 0.19108900427818298, "rewards/drgrpo_math_reward/mean": 0.50390625, "rewards/drgrpo_math_reward/std": 0.5009641647338867, "step": 57 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.687265558082829e-09, "advantages/std": 0.596076250076294, "advantages/var": 0.3553068959050165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 0.24759871931696906, "grad_norm": 0.21659403501122557, "learning_rate": 9.942047390821295e-07, "loss": 0.0, "num_tokens": 9797629.0, "reward": 0.66015625, "reward_std": 0.18292498588562012, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 58 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.317995276958649e-09, "advantages/std": 0.5726912021636963, "advantages/var": 0.32797521303569965, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.2518676627534685, "grad_norm": 0.21183684386735038, "learning_rate": 9.940000241095616e-07, "loss": -0.0, "num_tokens": 9959727.0, "reward": 0.6640625, "reward_std": 0.16898946464061737, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 59 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196689260745064e-09, "advantages/std": 0.5726897716522217, "advantages/var": 0.3279735745550738, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 0.256136606189968, "grad_norm": 0.2038191691018717, "learning_rate": 9.937917777060056e-07, "loss": 0.0, "num_tokens": 10104572.0, "reward": 0.75, "reward_std": 0.16834037005901337, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 60 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.9060964248770786e-10, "advantages/std": 0.596069872379303, "advantages/var": 0.35529929275827854, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.26040554962646745, "grad_norm": 0.26121318948558775, "learning_rate": 9.935800013601413e-07, "loss": 0.0, "num_tokens": 10247777.0, "reward": 0.78515625, "reward_std": 0.17597398161888123, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 61 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.4083615633936797e-09, "advantages/std": 0.6612808704376221, "advantages/var": 0.4372923896067391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.2646744930629669, "grad_norm": 0.26063853689599825, "learning_rate": 9.93364696585883e-07, "loss": -0.0, "num_tokens": 10425033.0, "reward": 0.6015625, "reward_std": 0.21713145077228546, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4905354380607605, "step": 62 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125766567692018e-09, "advantages/std": 0.5227915048599243, "advantages/var": 0.27331095755370427, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.2689434364994664, "grad_norm": 0.2080920271905109, "learning_rate": 9.931458649223683e-07, "loss": 0.0, "num_tokens": 10576767.0, "reward": 0.69140625, "reward_std": 0.13782215118408203, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 63 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516794888289669e-09, "advantages/std": 0.6185730695724487, "advantages/var": 0.3826326424002815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.27321237993596587, "grad_norm": 0.2266001825573351, "learning_rate": 9.929235079339465e-07, "loss": 0.0, "num_tokens": 10730985.0, "reward": 0.6328125, "reward_std": 0.19055859744548798, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 64 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.273877537771054e-08, "advantages/std": 0.5483195185661316, "advantages/var": 0.3006542944405943, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 0.27748132337246534, "grad_norm": 0.2285948803926055, "learning_rate": 9.926976272101692e-07, "loss": 0.0, "num_tokens": 10884864.0, "reward": 0.73046875, "reward_std": 0.16477571427822113, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 65 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4959557056427002, "advantages/var": 0.24597206195954868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.28175026680896476, "grad_norm": 0.1844509849519164, "learning_rate": 9.924682243657778e-07, "loss": 0.0, "num_tokens": 11025620.0, "reward": 0.7265625, "reward_std": 0.11705183982849121, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 66 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.8819951499331522e-09, "advantages/std": 0.6185739636421204, "advantages/var": 0.38263374849592324, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "epoch": 0.28601921024546423, "grad_norm": 0.24663370784559255, "learning_rate": 9.922353010406917e-07, "loss": -0.0, "num_tokens": 11203762.0, "reward": 0.58203125, "reward_std": 0.19385485351085663, "rewards/drgrpo_math_reward/mean": 0.58203125, "rewards/drgrpo_math_reward/std": 0.49419113993644714, "step": 67 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.812152228159681e-10, "advantages/std": 0.5960729718208313, "advantages/var": 0.35530298773531754, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "epoch": 0.2902881536819637, "grad_norm": 0.20863606400263202, "learning_rate": 9.91998858899997e-07, "loss": -0.0, "num_tokens": 11390686.0, "reward": 0.6328125, "reward_std": 0.17939136922359467, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 68 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 1.0615803089720003e-08, "advantages/std": 0.5483114123344421, "advantages/var": 0.3006454048961906, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 0.2945570971184632, "grad_norm": 0.21968065629436126, "learning_rate": 9.91758899633935e-07, "loss": 0.0, "num_tokens": 11532669.0, "reward": 0.78515625, "reward_std": 0.15570303797721863, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 69 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.3436386378449705e-09, "advantages/std": 0.5960747599601746, "advantages/var": 0.3553051194615797, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 0.29882604055496265, "grad_norm": 0.22408499383948063, "learning_rate": 9.915154249578892e-07, "loss": -0.0, "num_tokens": 11694199.0, "reward": 0.73046875, "reward_std": 0.1822758913040161, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 70 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.246355856158812e-09, "advantages/std": 0.548306941986084, "advantages/var": 0.30064050263013087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 0.3030949839914621, "grad_norm": 0.23142067685000983, "learning_rate": 9.91268436612374e-07, "loss": 0.0, "num_tokens": 11840347.0, "reward": 0.66796875, "reward_std": 0.15057815611362457, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 71 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.3436733225620384e-09, "advantages/std": 0.5960659384727478, "advantages/var": 0.35529460300739757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.3073639274279616, "grad_norm": 0.23038625562724183, "learning_rate": 9.91017936363021e-07, "loss": 0.0, "num_tokens": 11972459.0, "reward": 0.82421875, "reward_std": 0.16926030814647675, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3813795745372772, "step": 72 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46758967638015747, "advantages/var": 0.2186401054573004, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.31163287086446106, "grad_norm": 0.176058018727384, "learning_rate": 9.90763926000568e-07, "loss": -0.0, "num_tokens": 12133622.0, "reward": 0.6640625, "reward_std": 0.10258588939905167, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 73 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.468471563051712e-09, "advantages/std": 0.5960767865180969, "advantages/var": 0.3553075354257409, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "epoch": 0.31590181430096054, "grad_norm": 0.21721921107191133, "learning_rate": 9.90506407340845e-07, "loss": 0.0, "num_tokens": 12312108.0, "reward": 0.48828125, "reward_std": 0.18398582935333252, "rewards/drgrpo_math_reward/mean": 0.48828125, "rewards/drgrpo_math_reward/std": 0.5008418560028076, "step": 74 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.6363627332899125e-10, "advantages/std": 0.6402844190597534, "advantages/var": 0.4099641372906859, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.32017075773745995, "grad_norm": 0.2705578088558078, "learning_rate": 9.902453822247614e-07, "loss": -0.0, "num_tokens": 12496457.0, "reward": 0.66796875, "reward_std": 0.20608291029930115, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 75 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.33610210092994e-09, "advantages/std": 0.5227833390235901, "advantages/var": 0.27330241956065393, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.421875, "epoch": 0.3244397011739594, "grad_norm": 0.21517258013047688, "learning_rate": 9.899808525182934e-07, "loss": -0.0, "num_tokens": 12676832.0, "reward": 0.546875, "reward_std": 0.12927989661693573, "rewards/drgrpo_math_reward/mean": 0.546875, "rewards/drgrpo_math_reward/std": 0.4987730085849762, "step": 76 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.272772181567429e-10, "advantages/std": 0.6402803063392639, "advantages/var": 0.40995887068590164, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.3287086446104589, "grad_norm": 0.22787543124127765, "learning_rate": 9.897128201124698e-07, "loss": -0.0, "num_tokens": 12856929.0, "reward": 0.5859375, "reward_std": 0.20042762160301208, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.4935242533683777, "step": 77 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 8.917246484022717e-09, "advantages/std": 0.5483131408691406, "advantages/var": 0.30064730044978205, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 0.3329775880469584, "grad_norm": 0.21594927536319913, "learning_rate": 9.894412869233596e-07, "loss": -0.0, "num_tokens": 13041959.0, "reward": 0.609375, "reward_std": 0.15847134590148926, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 78 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.0539255098620102e-08, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.33724653148345785, "grad_norm": 0.1842527654890239, "learning_rate": 9.89166254892057e-07, "loss": 0.0, "num_tokens": 13212105.0, "reward": 0.7421875, "reward_std": 0.18543373048305511, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 79 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.098397946146296e-09, "advantages/std": 0.5726847648620605, "advantages/var": 0.3279678399051136, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.3415154749199573, "grad_norm": 0.18973959019347075, "learning_rate": 9.888877259846684e-07, "loss": 0.0, "num_tokens": 13372523.0, "reward": 0.7265625, "reward_std": 0.16044965386390686, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 80 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 7.904410095893117e-09, "advantages/std": 0.6185715794563293, "advantages/var": 0.38263079891109797, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "epoch": 0.3457844183564568, "grad_norm": 0.24357128545071005, "learning_rate": 9.886057021922982e-07, "loss": 0.0, "num_tokens": 13556828.0, "reward": 0.59375, "reward_std": 0.1914982795715332, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.49209436774253845, "step": 81 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.030935599219189e-09, "advantages/std": 0.5960730910301208, "advantages/var": 0.35530312985020274, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.35005336179295626, "grad_norm": 0.20902800614565434, "learning_rate": 9.883201855310348e-07, "loss": 0.0, "num_tokens": 13716324.0, "reward": 0.6875, "reward_std": 0.17780256271362305, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 82 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.5454588894424267e-09, "advantages/std": 0.6402831673622131, "advantages/var": 0.40996253440738784, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.35432230522945574, "grad_norm": 0.2840754464428357, "learning_rate": 9.880311780419353e-07, "loss": 0.0, "num_tokens": 13871107.0, "reward": 0.75390625, "reward_std": 0.20384500920772552, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 83 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 5.975091018752593e-09, "advantages/std": 0.701403796672821, "advantages/var": 0.4919672859870481, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.35859124866595515, "grad_norm": 0.2649071214396161, "learning_rate": 9.877386817910116e-07, "loss": 0.0, "num_tokens": 14054447.0, "reward": 0.58984375, "reward_std": 0.2580205500125885, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 84 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628885646149507e-09, "advantages/std": 0.5227907299995422, "advantages/var": 0.27331014737345427, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.3628601921024546, "grad_norm": 0.19237083737953337, "learning_rate": 9.874426988692163e-07, "loss": 0.0, "num_tokens": 14221007.0, "reward": 0.80859375, "reward_std": 0.13835012912750244, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 85 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.011203265744322e-09, "advantages/std": 0.6185716986656189, "advantages/var": 0.3826309463900692, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.3671291355389541, "grad_norm": 0.2315622924620685, "learning_rate": 9.871432313924253e-07, "loss": -0.0, "num_tokens": 14385112.0, "reward": 0.671875, "reward_std": 0.18990950286388397, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 86 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.7789658227491635e-09, "advantages/std": 0.36967357993125916, "advantages/var": 0.13665855569919305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 0.37139807897545357, "grad_norm": 0.15106295394289238, "learning_rate": 9.868402815014265e-07, "loss": 0.0, "num_tokens": 14535844.0, "reward": 0.65234375, "reward_std": 0.07232724130153656, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 87 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.694495078714221e-10, "advantages/std": 0.49596524238586426, "advantages/var": 0.24598152165486908, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.37566702241195304, "grad_norm": 0.1867263412626549, "learning_rate": 9.865338513619004e-07, "loss": -0.0, "num_tokens": 14695433.0, "reward": 0.69140625, "reward_std": 0.12677115201950073, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 88 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.9090742089424973e-09, "advantages/std": 0.6402879357337952, "advantages/var": 0.4099686406462446, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.3799359658484525, "grad_norm": 0.2450922826903169, "learning_rate": 9.86223943164408e-07, "loss": 0.0, "num_tokens": 14857936.0, "reward": 0.6796875, "reward_std": 0.2106773555278778, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 89 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726829767227173, "advantages/var": 0.32796579182799235, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.390625, "epoch": 0.384204909284952, "grad_norm": 0.2024341681897372, "learning_rate": 9.859105591243726e-07, "loss": -0.0, "num_tokens": 15042816.0, "reward": 0.62109375, "reward_std": 0.15927013754844666, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 90 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.285282070613587e-09, "advantages/std": 0.5726843476295471, "advantages/var": 0.32796736201987997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.38847385272145146, "grad_norm": 0.21879951538884707, "learning_rate": 9.85593701482066e-07, "loss": 0.0, "num_tokens": 15198870.0, "reward": 0.66796875, "reward_std": 0.16150802373886108, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 91 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.6597939099176911e-09, "advantages/std": 0.7013841867446899, "advantages/var": 0.4919397774155101, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.3927427961579509, "grad_norm": 0.32495948531780694, "learning_rate": 9.852733725025907e-07, "loss": 0.0, "num_tokens": 15351762.0, "reward": 0.65234375, "reward_std": 0.22962543368339539, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 92 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.0082834527591325e-09, "advantages/std": 0.5227863192558289, "advantages/var": 0.2733055356010574, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.39701173959445035, "grad_norm": 0.24303979574096995, "learning_rate": 9.849495744758654e-07, "loss": 0.0, "num_tokens": 15500943.0, "reward": 0.6953125, "reward_std": 0.1337556540966034, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 93 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.408336252804626e-09, "advantages/std": 0.4959695637226105, "advantages/var": 0.24598580813919657, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.4012806830309498, "grad_norm": 0.18723812086741365, "learning_rate": 9.84622309716607e-07, "loss": -0.0, "num_tokens": 15646996.0, "reward": 0.7265625, "reward_std": 0.13124938309192657, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 94 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.008182001769061e-09, "advantages/std": 0.522799551486969, "advantages/var": 0.27331937103497594, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.4055496264674493, "grad_norm": 0.22017340183627318, "learning_rate": 9.842915805643156e-07, "loss": 0.0, "num_tokens": 15808802.0, "reward": 0.6796875, "reward_std": 0.14795321226119995, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 95 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 7.724561021577794e-09, "advantages/std": 0.5726904273033142, "advantages/var": 0.3279743255248526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 0.40981856990394877, "grad_norm": 0.2742430048994587, "learning_rate": 9.839573893832563e-07, "loss": 0.0, "num_tokens": 15962018.0, "reward": 0.65625, "reward_std": 0.16781240701675415, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 96 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.285112648635477e-09, "advantages/std": 0.5727027058601379, "advantages/var": 0.3279883892995237, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 0.41408751334044824, "grad_norm": 0.25942822428286266, "learning_rate": 9.836197385624432e-07, "loss": 0.0, "num_tokens": 16118197.0, "reward": 0.6484375, "reward_std": 0.18212617933750153, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 97 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131044932268215e-10, "advantages/std": 0.5726954936981201, "advantages/var": 0.32798012850213354, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 0.4183564567769477, "grad_norm": 0.21659055119101517, "learning_rate": 9.832786305156228e-07, "loss": 0.0, "num_tokens": 16274190.0, "reward": 0.6953125, "reward_std": 0.17411433160305023, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 98 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.246282000263511e-10, "advantages/std": 0.548316478729248, "advantages/var": 0.3006509608460419, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.4226254002134472, "grad_norm": 0.2244429283719818, "learning_rate": 9.829340676812552e-07, "loss": -0.0, "num_tokens": 16417922.0, "reward": 0.69140625, "reward_std": 0.1618887335062027, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 99 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8819860826981316e-09, "advantages/std": 0.6185769438743591, "advantages/var": 0.38263743549294205, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 0.42689434364994666, "grad_norm": 0.2245456822690171, "learning_rate": 9.825860525224981e-07, "loss": -0.0, "num_tokens": 16570727.0, "reward": 0.7890625, "reward_std": 0.19727224111557007, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 100 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.065590591029637e-09, "advantages/std": 0.5726858973503113, "advantages/var": 0.32796913702393127, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.4311632870864461, "grad_norm": 0.24390358364587256, "learning_rate": 9.822345875271883e-07, "loss": 0.0, "num_tokens": 16718926.0, "reward": 0.76171875, "reward_std": 0.1621571183204651, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 101 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 5.281322540434018e-09, "advantages/std": 0.6612850427627563, "advantages/var": 0.4372979077817405, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.43543223052294555, "grad_norm": 0.27030435312221507, "learning_rate": 9.818796752078243e-07, "loss": -0.0, "num_tokens": 16898303.0, "reward": 0.55859375, "reward_std": 0.22290295362472534, "rewards/drgrpo_math_reward/mean": 0.55859375, "rewards/drgrpo_math_reward/std": 0.4975275993347168, "step": 102 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6985576679666268e-09, "advantages/std": 0.5483019948005676, "advantages/var": 0.3006350775022817, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 0.439701173959445, "grad_norm": 0.1900534271188019, "learning_rate": 9.815213181015487e-07, "loss": 0.0, "num_tokens": 17065727.0, "reward": 0.7265625, "reward_std": 0.14651167392730713, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 103 }, { "advantages/mean": -6.28642737865448e-09, "advantages/snr": 1.0546620649499414e-08, "advantages/std": 0.5960608124732971, "advantages/var": 0.3552884921663271, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.4439701173959445, "grad_norm": 0.28215753816918543, "learning_rate": 9.811595187701293e-07, "loss": 0.0, "num_tokens": 17218805.0, "reward": 0.6875, "reward_std": 0.1641329973936081, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 104 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.5155324825148615e-09, "advantages/std": 0.5960621237754822, "advantages/var": 0.35529005539973824, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.44823906083244397, "grad_norm": 0.24532268661495166, "learning_rate": 9.807942797999412e-07, "loss": -0.0, "num_tokens": 17382522.0, "reward": 0.5546875, "reward_std": 0.16637088358402252, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49797385931015015, "step": 105 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.659041433501714e-09, "advantages/std": 0.5726843476295471, "advantages/var": 0.32796736201987997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.45250800426894344, "grad_norm": 0.251796558756451, "learning_rate": 9.804256038019481e-07, "loss": 0.0, "num_tokens": 17534509.0, "reward": 0.79296875, "reward_std": 0.16150803864002228, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 106 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 6.8315477033413135e-09, "advantages/std": 0.6816336512565613, "advantages/var": 0.4646244345253514, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 0.4567769477054429, "grad_norm": 0.22918055149473104, "learning_rate": 9.800534934116842e-07, "loss": 0.0, "num_tokens": 17707238.0, "reward": 0.6953125, "reward_std": 0.23330241441726685, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 107 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.94494343648925e-09, "advantages/std": 0.5483027696609497, "advantages/var": 0.30063592721786847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.4610458911419424, "grad_norm": 0.20682523205492293, "learning_rate": 9.796779512892345e-07, "loss": -0.0, "num_tokens": 17877082.0, "reward": 0.6953125, "reward_std": 0.1459837108850479, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 108 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983506540239536e-09, "advantages/std": 0.4675893187522888, "advantages/var": 0.21863977101122956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.46531483457844186, "grad_norm": 0.20803318441731042, "learning_rate": 9.792989801192167e-07, "loss": -0.0, "num_tokens": 18029785.0, "reward": 0.76171875, "reward_std": 0.10205547511577606, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 109 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.572338921941043e-09, "advantages/std": 0.4959617853164673, "advantages/var": 0.24597809249429758, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.4695837780149413, "grad_norm": 0.1849990700457602, "learning_rate": 9.78916582610761e-07, "loss": -0.0, "num_tokens": 18185127.0, "reward": 0.73046875, "reward_std": 0.12217915058135986, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 110 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.2584118068471646e-09, "advantages/std": 0.6185691356658936, "advantages/var": 0.3826277755984506, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.47385272145144075, "grad_norm": 0.2517914798778077, "learning_rate": 9.78530761497492e-07, "loss": -0.0, "num_tokens": 18350297.0, "reward": 0.5859375, "reward_std": 0.18543371558189392, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.4935242533683777, "step": 111 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 6.831425840696212e-10, "advantages/std": 0.6816458106040955, "advantages/var": 0.4646410111141144, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.4781216648879402, "grad_norm": 0.24095208318789144, "learning_rate": 9.781415195375076e-07, "loss": -0.0, "num_tokens": 18528245.0, "reward": 0.57421875, "reward_std": 0.24974030256271362, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 112 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.979300026668281e-10, "advantages/std": 0.4675971269607544, "advantages/var": 0.21864707314195186, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.4823906083244397, "grad_norm": 0.1594297751631633, "learning_rate": 9.77748859513361e-07, "loss": -0.0, "num_tokens": 18687978.0, "reward": 0.68359375, "reward_std": 0.10889026522636414, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 113 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.3436505898950562e-09, "advantages/std": 0.596071720123291, "advantages/var": 0.355301495530739, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.48665955176093917, "grad_norm": 0.20941137193092302, "learning_rate": 9.77352784232039e-07, "loss": 0.0, "num_tokens": 18869022.0, "reward": 0.6171875, "reward_std": 0.17885850369930267, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 114 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 8.537693145596392e-09, "advantages/std": 0.5726890563964844, "advantages/var": 0.32797275531629566, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.49092849519743864, "grad_norm": 0.25209405893787906, "learning_rate": 9.769532965249435e-07, "loss": 0.0, "num_tokens": 19019613.0, "reward": 0.7265625, "reward_std": 0.16557452082633972, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 115 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 7.74590481727937e-09, "advantages/std": 0.6612880229949951, "advantages/var": 0.4373018493566292, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 0.4951974386339381, "grad_norm": 0.2484135721878627, "learning_rate": 9.765503992478703e-07, "loss": -0.0, "num_tokens": 19201913.0, "reward": 0.56640625, "reward_std": 0.22685076296329498, "rewards/drgrpo_math_reward/mean": 0.56640625, "rewards/drgrpo_math_reward/std": 0.4965413510799408, "step": 116 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5050695213580615e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.4994663820704376, "grad_norm": 0.20850762851886148, "learning_rate": 9.761440952809897e-07, "loss": 0.0, "num_tokens": 19370768.0, "reward": 0.71875, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 117 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1718284587501684e-09, "advantages/std": 0.5960701107978821, "advantages/var": 0.3552995769865994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "epoch": 0.503735325506937, "grad_norm": 0.19440456705898, "learning_rate": 9.75734387528824e-07, "loss": -0.0, "num_tokens": 19559213.0, "reward": 0.58984375, "reward_std": 0.17609019577503204, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 118 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899591307565781e-09, "advantages/std": 0.40494683384895325, "advantages/var": 0.16398193824429175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.5080042689434365, "grad_norm": 0.1625977913542358, "learning_rate": 9.75321278920229e-07, "loss": 0.0, "num_tokens": 19693612.0, "reward": 0.8203125, "reward_std": 0.07825092226266861, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 119 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.6556072918866842e-09, "advantages/std": 0.7014008164405823, "advantages/var": 0.4919631053035154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 0.512273212379936, "grad_norm": 0.27257864175109514, "learning_rate": 9.749047724083714e-07, "loss": -0.0, "num_tokens": 19880428.0, "reward": 0.55078125, "reward_std": 0.25395649671554565, "rewards/drgrpo_math_reward/mean": 0.55078125, "rewards/drgrpo_math_reward/std": 0.49838894605636597, "step": 120 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.520307620092389e-09, "advantages/std": 0.5483024716377258, "advantages/var": 0.30063560040403914, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.5165421558164355, "grad_norm": 0.23162261371168508, "learning_rate": 9.74484870970709e-07, "loss": -0.0, "num_tokens": 20019486.0, "reward": 0.75390625, "reward_std": 0.1454533040523529, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 121 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.5478002194722765e-09, "advantages/std": 0.5483098030090332, "advantages/var": 0.3006436400758048, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 0.5208110992529349, "grad_norm": 0.22311043728478686, "learning_rate": 9.74061577608968e-07, "loss": 0.0, "num_tokens": 20199121.0, "reward": 0.58203125, "reward_std": 0.15505394339561462, "rewards/drgrpo_math_reward/mean": 0.58203125, "rewards/drgrpo_math_reward/std": 0.49419113993644714, "step": 122 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.369594716994514e-09, "advantages/std": 0.5483016967773438, "advantages/var": 0.3006347506889142, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "epoch": 0.5250800426894343, "grad_norm": 0.25070328563799926, "learning_rate": 9.736348953491221e-07, "loss": 0.0, "num_tokens": 20377999.0, "reward": 0.57421875, "reward_std": 0.14598125219345093, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 123 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 7.469034298941239e-09, "advantages/std": 0.4675918519496918, "advantages/var": 0.21864214000974247, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.484375, "epoch": 0.5293489861259338, "grad_norm": 0.18374980419818973, "learning_rate": 9.732048272413725e-07, "loss": -0.0, "num_tokens": 20549287.0, "reward": 0.65234375, "reward_std": 0.10376540571451187, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 124 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.468483593103085e-09, "advantages/std": 0.5960754752159119, "advantages/var": 0.35530597215387516, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.5336179295624333, "grad_norm": 0.25946092115826264, "learning_rate": 9.727713763601226e-07, "loss": 0.0, "num_tokens": 20713814.0, "reward": 0.64453125, "reward_std": 0.1817479431629181, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 125 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 2.987551856360924e-09, "advantages/std": 0.7014023065567017, "advantages/var": 0.4919651956430613, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.5378868729989328, "grad_norm": 0.2839156360779615, "learning_rate": 9.723345458039593e-07, "loss": 0.0, "num_tokens": 20890843.0, "reward": 0.6015625, "reward_std": 0.25513601303100586, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4905354380607605, "step": 126 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.226836580992585e-09, "advantages/std": 0.5227833986282349, "advantages/var": 0.2733024818812879, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.5421558164354322, "grad_norm": 0.24753063560244626, "learning_rate": 9.718943386956296e-07, "loss": -0.0, "num_tokens": 21043874.0, "reward": 0.7109375, "reward_std": 0.13098491728305817, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 127 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.281328252797118e-09, "advantages/std": 0.661284327507019, "advantages/var": 0.4372969618064104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.5464247598719317, "grad_norm": 0.27079609566069784, "learning_rate": 9.714507581820179e-07, "loss": 0.0, "num_tokens": 21215949.0, "reward": 0.60546875, "reward_std": 0.22172591090202332, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 128 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196727343226576e-09, "advantages/std": 0.5726879835128784, "advantages/var": 0.3279715264600469, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.5506937033084311, "grad_norm": 0.23678676576756394, "learning_rate": 9.71003807434124e-07, "loss": -0.0, "num_tokens": 21390672.0, "reward": 0.62109375, "reward_std": 0.1638670563697815, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 129 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.1291945873965149e-09, "advantages/std": 0.6185753345489502, "advantages/var": 0.38263544451234566, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.5549626467449307, "grad_norm": 0.24438298608082065, "learning_rate": 9.7055348964704e-07, "loss": -0.0, "num_tokens": 21549093.0, "reward": 0.66796875, "reward_std": 0.19450394809246063, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 130 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9916779936341355e-09, "advantages/std": 0.4676069915294647, "advantages/var": 0.2186562985272369, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.5592315901814301, "grad_norm": 0.17529735964744536, "learning_rate": 9.700998080399285e-07, "loss": -0.0, "num_tokens": 21712261.0, "reward": 0.671875, "reward_std": 0.11849337071180344, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 131 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524223734538706e-09, "advantages/std": 0.572694718837738, "advantages/var": 0.3279792409846358, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.5635005336179295, "grad_norm": 0.2084611382117928, "learning_rate": 9.696427658559982e-07, "loss": 0.0, "num_tokens": 21881266.0, "reward": 0.640625, "reward_std": 0.172937273979187, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 132 }, { "advantages/mean": -5.3551048040390015e-09, "advantages/snr": 9.350906047837328e-09, "advantages/std": 0.5726829767227173, "advantages/var": 0.32796579182799235, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 0.567769477054429, "grad_norm": 0.22218752716256074, "learning_rate": 9.691823663624816e-07, "loss": 0.0, "num_tokens": 22042837.0, "reward": 0.75390625, "reward_std": 0.15927013754844666, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 133 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.226790627738806e-09, "advantages/std": 0.5227941870689392, "advantages/var": 0.273313762033073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.5720384204909285, "grad_norm": 0.22175051939206694, "learning_rate": 9.687186128506113e-07, "loss": 0.0, "num_tokens": 22214689.0, "reward": 0.6328125, "reward_std": 0.14176751673221588, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 134 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.258423993525738e-09, "advantages/std": 0.6185657978057861, "advantages/var": 0.3826236462151087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.576307363927428, "grad_norm": 0.23234493551308205, "learning_rate": 9.682515086355972e-07, "loss": -0.0, "num_tokens": 22391548.0, "reward": 0.609375, "reward_std": 0.1830746978521347, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 135 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.906095253094573e-09, "advantages/std": 0.5960700511932373, "advantages/var": 0.35529950592950854, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.5805763073639274, "grad_norm": 0.24149835642056233, "learning_rate": 9.67781057056601e-07, "loss": 0.0, "num_tokens": 22569499.0, "reward": 0.62890625, "reward_std": 0.1743851751089096, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 136 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.571054469954958e-09, "advantages/std": 0.5227965116500854, "advantages/var": 0.27331619259349793, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.5848452508004269, "grad_norm": 0.22422672779414102, "learning_rate": 9.673072614767146e-07, "loss": 0.0, "num_tokens": 22711127.0, "reward": 0.7421875, "reward_std": 0.1434774398803711, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 137 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.636395230754275e-10, "advantages/std": 0.640278697013855, "advantages/var": 0.4099568098497599, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.5891141942369263, "grad_norm": 0.25984251362548044, "learning_rate": 9.668301252829343e-07, "loss": 0.0, "num_tokens": 22891737.0, "reward": 0.66015625, "reward_std": 0.19924810528755188, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 138 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.5723713064881815e-09, "advantages/std": 0.4959593415260315, "advantages/var": 0.24597566844693475, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 0.5933831376734259, "grad_norm": 0.2247332333432713, "learning_rate": 9.66349651886138e-07, "loss": 0.0, "num_tokens": 23038921.0, "reward": 0.76171875, "reward_std": 0.12046922743320465, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 139 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.812210035940531e-09, "advantages/std": 0.5960685610771179, "advantages/var": 0.35529772950454586, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 0.5976520811099253, "grad_norm": 0.2377338133628425, "learning_rate": 9.658658447210594e-07, "loss": 0.0, "num_tokens": 23200038.0, "reward": 0.68359375, "reward_std": 0.1737360954284668, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 140 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3663241168119124e-09, "advantages/std": 0.6816263794898987, "advantages/var": 0.46461452121650737, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.6019210245464247, "grad_norm": 0.2712603744994714, "learning_rate": 9.653787072462643e-07, "loss": 0.0, "num_tokens": 23383317.0, "reward": 0.5625, "reward_std": 0.2213476598262787, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49705013632774353, "step": 141 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.49595075845718384, "advantages/var": 0.2459671548142559, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.6061899679829242, "grad_norm": 0.17658537259355056, "learning_rate": 9.648882429441256e-07, "loss": -0.0, "num_tokens": 23547586.0, "reward": 0.578125, "reward_std": 0.11192697286605835, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49482619762420654, "step": 142 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.96696941108981e-09, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.6104589114194237, "grad_norm": 0.1844179462576979, "learning_rate": 9.64394455320799e-07, "loss": 0.0, "num_tokens": 23701368.0, "reward": 0.69921875, "reward_std": 0.10376539826393127, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 143 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.5209299319642286e-10, "advantages/std": 0.6612759828567505, "advantages/var": 0.43728592550316137, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.6147278548559232, "grad_norm": 0.25430397247621467, "learning_rate": 9.63897347906197e-07, "loss": -0.0, "num_tokens": 23881005.0, "reward": 0.671875, "reward_std": 0.20976868271827698, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 144 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.505114884350798e-09, "advantages/std": 0.5726709365844727, "advantages/var": 0.3279520016085371, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.6189967982924226, "grad_norm": 0.31751555496280937, "learning_rate": 9.633969242539642e-07, "loss": 0.0, "num_tokens": 24039875.0, "reward": 0.66015625, "reward_std": 0.14507260918617249, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 145 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0688690066716281e-08, "advantages/std": 0.5227895379066467, "advantages/var": 0.2733089009446452, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.6232657417289221, "grad_norm": 0.21736187090374912, "learning_rate": 9.628931879414516e-07, "loss": 0.0, "num_tokens": 24193969.0, "reward": 0.65625, "reward_std": 0.13664264976978302, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 146 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814402201807623e-09, "advantages/std": 0.5227919220924377, "advantages/var": 0.2733113938051055, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.6275346851654215, "grad_norm": 0.221860019654238, "learning_rate": 9.623861425696917e-07, "loss": -0.0, "num_tokens": 24361745.0, "reward": 0.6484375, "reward_std": 0.14005759358406067, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 147 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.077846141054734e-09, "advantages/std": 0.5960791707038879, "advantages/var": 0.3553103777470348, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.6318036286019211, "grad_norm": 0.24882667930071267, "learning_rate": 9.618757917633722e-07, "loss": 0.0, "num_tokens": 24499558.0, "reward": 0.75, "reward_std": 0.18793118000030518, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 148 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.7639203017764606e-10, "advantages/std": 0.618585467338562, "advantages/var": 0.38264798040246717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.6360725720384205, "grad_norm": 0.25328994588616843, "learning_rate": 9.613621391708097e-07, "loss": 0.0, "num_tokens": 24672886.0, "reward": 0.53125, "reward_std": 0.20699402689933777, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.5, "step": 149 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196822550470801e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 0.6403415154749199, "grad_norm": 0.310149121041335, "learning_rate": 9.608451884639248e-07, "loss": 0.0, "num_tokens": 24837025.0, "reward": 0.6171875, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 150 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 4.8931345825649164e-09, "advantages/std": 0.6185806393623352, "advantages/var": 0.3826420073939154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.6446104589114194, "grad_norm": 0.27449635019824886, "learning_rate": 9.603249433382144e-07, "loss": 0.0, "num_tokens": 24996536.0, "reward": 0.62890625, "reward_std": 0.20186668634414673, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 151 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.337556201943589e-09, "advantages/std": 0.6612882614135742, "advantages/var": 0.4373021646833877, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.6488794023479189, "grad_norm": 0.22105427222019133, "learning_rate": 9.598014075127264e-07, "loss": -0.0, "num_tokens": 25172186.0, "reward": 0.6171875, "reward_std": 0.22738119959831238, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 152 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 9.581590371992046e-09, "advantages/std": 0.43739622831344604, "advantages/var": 0.19131546054282822, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.6531483457844184, "grad_norm": 0.2409562006634107, "learning_rate": 9.592745847300331e-07, "loss": 0.0, "num_tokens": 25323454.0, "reward": 0.6953125, "reward_std": 0.09495474398136139, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 153 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917335935919526e-09, "advantages/std": 0.4675939381122589, "advantages/var": 0.21864409095933102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 0.6574172892209178, "grad_norm": 0.22997997409735074, "learning_rate": 9.587444787562037e-07, "loss": -0.0, "num_tokens": 25492051.0, "reward": 0.6484375, "reward_std": 0.1065337061882019, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 154 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196999005160248e-09, "advantages/std": 0.5726752281188965, "advantages/var": 0.3279569169010301, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 0.6616862326574173, "grad_norm": 0.24108914953717586, "learning_rate": 9.582110933807776e-07, "loss": 0.0, "num_tokens": 25657409.0, "reward": 0.56640625, "reward_std": 0.15019746124744415, "rewards/drgrpo_math_reward/mean": 0.56640625, "rewards/drgrpo_math_reward/std": 0.4965413510799408, "step": 155 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6402879357337952, "advantages/var": 0.4099686406462446, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.6659551760939167, "grad_norm": 0.24492241583107927, "learning_rate": 9.576744324167378e-07, "loss": 0.0, "num_tokens": 25817785.0, "reward": 0.71875, "reward_std": 0.2106773555278778, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 156 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.272752356351317e-09, "advantages/std": 0.6402793526649475, "advantages/var": 0.4099576494490442, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.6702241195304163, "grad_norm": 0.2652156850382761, "learning_rate": 9.571344997004831e-07, "loss": -0.0, "num_tokens": 25981209.0, "reward": 0.67578125, "reward_std": 0.19872015714645386, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 157 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 5.985519380243054e-09, "advantages/std": 0.6612827777862549, "advantages/var": 0.43729491219670535, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 0.6744930629669157, "grad_norm": 0.24167674819050053, "learning_rate": 9.565912990918014e-07, "loss": 0.0, "num_tokens": 26153857.0, "reward": 0.671875, "reward_std": 0.2205463945865631, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 158 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 8.917546993755515e-09, "advantages/std": 0.5482946634292603, "advantages/var": 0.3006270379450058, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 0.6787620064034151, "grad_norm": 0.25223176431793914, "learning_rate": 9.560448344738409e-07, "loss": 0.0, "num_tokens": 26318338.0, "reward": 0.7265625, "reward_std": 0.1369110345840454, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 159 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917699002625455e-09, "advantages/std": 0.46758541464805603, "advantages/var": 0.2186361199915945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 0.6830309498399146, "grad_norm": 0.23792049639198487, "learning_rate": 9.554951097530832e-07, "loss": 0.0, "num_tokens": 26439011.0, "reward": 0.8203125, "reward_std": 0.09863808751106262, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 160 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.48964112740099e-09, "advantages/std": 0.4675987958908081, "advantages/var": 0.21864863391853362, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.687299893276414, "grad_norm": 0.17305993617314033, "learning_rate": 9.549421288593157e-07, "loss": -0.0, "num_tokens": 26599069.0, "reward": 0.62890625, "reward_std": 0.11112815886735916, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 161 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131161717523846e-10, "advantages/std": 0.5726872682571411, "advantages/var": 0.3279707072238267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.6915688367129136, "grad_norm": 0.1947637979578463, "learning_rate": 9.543858957456025e-07, "loss": 0.0, "num_tokens": 26749602.0, "reward": 0.74609375, "reward_std": 0.1643950194120407, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 162 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.4896779409655654e-09, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 0.695837780149413, "grad_norm": 0.17657640815068512, "learning_rate": 9.53826414388257e-07, "loss": 0.0, "num_tokens": 26912977.0, "reward": 0.64453125, "reward_std": 0.10376540571451187, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 163 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5624695056307517e-09, "advantages/std": 0.5960580706596375, "advantages/var": 0.35528522359848935, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 0.7001067235859125, "grad_norm": 0.3028446810703845, "learning_rate": 9.532636887868132e-07, "loss": -0.0, "num_tokens": 27055838.0, "reward": 0.7109375, "reward_std": 0.16124600172042847, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 164 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.511192125942753e-09, "advantages/std": 0.49596524238586426, "advantages/var": 0.24598152165486908, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.7043756670224119, "grad_norm": 0.24471970204004162, "learning_rate": 9.526977229639965e-07, "loss": 0.0, "num_tokens": 27206332.0, "reward": 0.70703125, "reward_std": 0.12677115201950073, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 165 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 7.041900874137961e-09, "advantages/std": 0.49595409631729126, "advantages/var": 0.24597046565390102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 0.7086446104589115, "grad_norm": 0.21906254322665267, "learning_rate": 9.521285209656962e-07, "loss": -0.0, "num_tokens": 27351069.0, "reward": 0.6328125, "reward_std": 0.11481393873691559, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 166 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.812142072827069e-10, "advantages/std": 0.5960737466812134, "advantages/var": 0.35530391148257934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.7129135538954109, "grad_norm": 0.2553570949745252, "learning_rate": 9.515560868609352e-07, "loss": 0.0, "num_tokens": 27492145.0, "reward": 0.8046875, "reward_std": 0.18056842684745789, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 167 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.387563197817429e-09, "advantages/std": 0.618579089641571, "advantages/var": 0.3826400901417948, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.7171824973319103, "grad_norm": 0.23929974194308415, "learning_rate": 9.509804247418421e-07, "loss": -0.0, "num_tokens": 27659931.0, "reward": 0.625, "reward_std": 0.1990984082221985, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4850712716579437, "step": 168 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.520259612560154e-09, "advantages/std": 0.5483072400093079, "advantages/var": 0.30064082944662474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 0.7214514407684098, "grad_norm": 0.2769979980823483, "learning_rate": 9.504015387236213e-07, "loss": -0.0, "num_tokens": 27801772.0, "reward": 0.796875, "reward_std": 0.15110857784748077, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 169 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.272636100449617e-10, "advantages/std": 0.6402922868728638, "advantages/var": 0.4099742126288817, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.7257203842049093, "grad_norm": 0.29082131808158546, "learning_rate": 9.498194329445234e-07, "loss": -0.0, "num_tokens": 27973134.0, "reward": 0.53125, "reward_std": 0.21686306595802307, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.5, "step": 170 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629763089128757e-09, "advantages/std": 0.5227778553962708, "advantages/var": 0.2732966860927242, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.7299893276414088, "grad_norm": 0.24172170954180977, "learning_rate": 9.492341115658165e-07, "loss": 0.0, "num_tokens": 28131313.0, "reward": 0.796875, "reward_std": 0.12468298524618149, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 171 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5960737466812134, "advantages/var": 0.35530391148257934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 0.7342582710779082, "grad_norm": 0.24724705785330842, "learning_rate": 9.486455787717555e-07, "loss": 0.0, "num_tokens": 28314443.0, "reward": 0.6171875, "reward_std": 0.18056842684745789, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 172 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.979316211871183e-10, "advantages/std": 0.4675956070423126, "advantages/var": 0.21864565172526884, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.7385272145144077, "grad_norm": 0.14575457358998883, "learning_rate": 9.480538387695524e-07, "loss": 0.0, "num_tokens": 28486372.0, "reward": 0.671875, "reward_std": 0.10877159237861633, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 173 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.225042778153291e-09, "advantages/std": 0.4959655702114105, "advantages/var": 0.24598184683512958, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.7427961579509071, "grad_norm": 0.30035164580113827, "learning_rate": 9.47458895789347e-07, "loss": 0.0, "num_tokens": 28638341.0, "reward": 0.71875, "reward_std": 0.12730157375335693, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 174 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.1292017686950893e-09, "advantages/std": 0.618571400642395, "advantages/var": 0.3826305776926944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "epoch": 0.7470651013874067, "grad_norm": 0.26582968832894643, "learning_rate": 9.468607540841753e-07, "loss": 0.0, "num_tokens": 28802994.0, "reward": 0.62109375, "reward_std": 0.18937908113002777, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 175 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 1.1133803351426107e-08, "advantages/std": 0.5228012204170227, "advantages/var": 0.27332111606952836, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 0.7513340448239061, "grad_norm": 0.15052650271144502, "learning_rate": 9.462594179299405e-07, "loss": 0.0, "num_tokens": 28970090.0, "reward": 0.71875, "reward_std": 0.14860230684280396, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 176 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.1292782847223813e-09, "advantages/std": 0.4373888373374939, "advantages/var": 0.1913089950274447, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 0.7556029882604055, "grad_norm": 0.15101127830494457, "learning_rate": 9.456548916253814e-07, "loss": -0.0, "num_tokens": 29133620.0, "reward": 0.640625, "reward_std": 0.0875919908285141, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 177 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.6348574440747036e-09, "advantages/std": 0.6185588836669922, "advantages/var": 0.3826150925633556, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.759871931696905, "grad_norm": 0.23053809270745587, "learning_rate": 9.450471794920424e-07, "loss": -0.0, "num_tokens": 29314272.0, "reward": 0.58984375, "reward_std": 0.1745324432849884, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 178 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.661603659212928e-09, "advantages/std": 0.4373878836631775, "advantages/var": 0.1913081607753533, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 0.7641408751334045, "grad_norm": 0.24006538352367196, "learning_rate": 9.444362858742416e-07, "loss": -0.0, "num_tokens": 29452831.0, "reward": 0.7578125, "reward_std": 0.08811995387077332, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 179 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.7604386253420069e-09, "advantages/std": 0.6612858772277832, "advantages/var": 0.43729901142091876, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 0.768409818569904, "grad_norm": 0.26441195567315334, "learning_rate": 9.438222151390412e-07, "loss": -0.0, "num_tokens": 29616682.0, "reward": 0.640625, "reward_std": 0.22290541231632233, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 180 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.286176964687214e-09, "advantages/std": 0.49596065282821655, "advantages/var": 0.24597696915379075, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 0.7726787620064034, "grad_norm": 0.22092294807165377, "learning_rate": 9.432049716762149e-07, "loss": -0.0, "num_tokens": 29766007.0, "reward": 0.71875, "reward_std": 0.12217669934034348, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 181 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.527988578946327e-10, "advantages/std": 0.6185733079910278, "advantages/var": 0.382632937358963, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.7769477054429029, "grad_norm": 0.24586197971513324, "learning_rate": 9.425845598982176e-07, "loss": -0.0, "num_tokens": 29912855.0, "reward": 0.72265625, "reward_std": 0.1910889893770218, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 182 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.6614848777545554e-09, "advantages/std": 0.43740740418434143, "advantages/var": 0.19132523723528383, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.7812166488794023, "grad_norm": 0.1573793941424989, "learning_rate": 9.419609842401528e-07, "loss": -0.0, "num_tokens": 30075024.0, "reward": 0.6640625, "reward_std": 0.10520447790622711, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 183 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.2596858417853543e-09, "advantages/std": 0.36966460943222046, "advantages/var": 0.1366519234666761, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.7854855923159018, "grad_norm": 0.15453159134925487, "learning_rate": 9.413342491597418e-07, "loss": -0.0, "num_tokens": 30234875.0, "reward": 0.71484375, "reward_std": 0.0665532797574997, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 184 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.246388168920893e-10, "advantages/std": 0.5483027696609497, "advantages/var": 0.30063592721786847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.7897545357524013, "grad_norm": 0.2528426577376825, "learning_rate": 9.407043591372916e-07, "loss": 0.0, "num_tokens": 30385131.0, "reward": 0.734375, "reward_std": 0.1459837108850479, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 185 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.944999661654864e-09, "advantages/std": 0.5482975840568542, "advantages/var": 0.30063024068258315, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 0.7940234791889007, "grad_norm": 0.2395271449170319, "learning_rate": 9.400713186756623e-07, "loss": 0.0, "num_tokens": 30550599.0, "reward": 0.703125, "reward_std": 0.14138680696487427, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 186 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.0112142916763455e-09, "advantages/std": 0.6185694336891174, "advantages/var": 0.38262814429447545, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 0.7982924226254002, "grad_norm": 0.22912614754777264, "learning_rate": 9.39435132300236e-07, "loss": 0.0, "num_tokens": 30731714.0, "reward": 0.60546875, "reward_std": 0.18596413731575012, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 187 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.492741255292344e-09, "advantages/std": 0.5483050346374512, "advantages/var": 0.30063841100877653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 0.8025613660618997, "grad_norm": 0.2553756632167738, "learning_rate": 9.387958045588835e-07, "loss": 0.0, "num_tokens": 30884902.0, "reward": 0.6953125, "reward_std": 0.14939865469932556, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 188 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.3236407809283777e-09, "advantages/std": 0.7014055252075195, "advantages/var": 0.4919697107916363, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.484375, "epoch": 0.8068303094983992, "grad_norm": 0.24399140874857214, "learning_rate": 9.381533400219317e-07, "loss": 0.0, "num_tokens": 31067486.0, "reward": 0.58203125, "reward_std": 0.2597304582595825, "rewards/drgrpo_math_reward/mean": 0.58203125, "rewards/drgrpo_math_reward/std": 0.49419113993644714, "step": 189 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196689260745064e-09, "advantages/std": 0.5726897716522217, "advantages/var": 0.3279735745550738, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 0.8110992529348986, "grad_norm": 0.26704959527215627, "learning_rate": 9.375077432821321e-07, "loss": 0.0, "num_tokens": 31212571.0, "reward": 0.6328125, "reward_std": 0.16834037005901337, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 190 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907071115042357e-10, "advantages/std": 0.522799551486969, "advantages/var": 0.27331937103497594, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 0.8153681963713981, "grad_norm": 0.253644840740441, "learning_rate": 9.368590189546267e-07, "loss": -0.0, "num_tokens": 31362826.0, "reward": 0.7109375, "reward_std": 0.14795321226119995, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 191 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.9530591491397785e-09, "advantages/std": 0.5960665345191956, "advantages/var": 0.35529531357372335, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.8196371398078975, "grad_norm": 0.2227309105410048, "learning_rate": 9.362071716769158e-07, "loss": -0.0, "num_tokens": 31535677.0, "reward": 0.60546875, "reward_std": 0.1720261573791504, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 192 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 4.72739521502879e-09, "advantages/std": 0.6402676701545715, "advantages/var": 0.4099426894451632, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.823906083244397, "grad_norm": 0.2681728150646929, "learning_rate": 9.355522061088241e-07, "loss": 0.0, "num_tokens": 31704414.0, "reward": 0.6015625, "reward_std": 0.1845201551914215, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4905354380607605, "step": 193 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.2853073748283135e-09, "advantages/std": 0.5726816058158875, "advantages/var": 0.3279642216398635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.8281750266808965, "grad_norm": 0.25600457019311335, "learning_rate": 9.348941269324686e-07, "loss": -0.0, "num_tokens": 31871378.0, "reward": 0.62109375, "reward_std": 0.15703225135803223, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 194 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.898922066460967e-09, "advantages/std": 0.5227960348129272, "advantages/var": 0.27331569401611944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.8324439701173959, "grad_norm": 0.17012108669775766, "learning_rate": 9.342329388522237e-07, "loss": -0.0, "num_tokens": 32037929.0, "reward": 0.66796875, "reward_std": 0.14453580975532532, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 195 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.4686569410831624e-09, "advantages/std": 0.5960565805435181, "advantages/var": 0.35528344720923144, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.8367129135538954, "grad_norm": 0.31281434138995456, "learning_rate": 9.335686465946886e-07, "loss": 0.0, "num_tokens": 32174851.0, "reward": 0.7734375, "reward_std": 0.16059692203998566, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 196 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.7639801445415536e-10, "advantages/std": 0.6185756325721741, "advantages/var": 0.3826358132120653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.8409818569903948, "grad_norm": 0.24160142918946306, "learning_rate": 9.32901254908653e-07, "loss": 0.0, "num_tokens": 32342089.0, "reward": 0.7109375, "reward_std": 0.19503435492515564, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 197 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.3361013392583426e-09, "advantages/std": 0.522783637046814, "advantages/var": 0.2733027311638949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 0.8452508004268944, "grad_norm": 0.21258248842995997, "learning_rate": 9.322307685650637e-07, "loss": 0.0, "num_tokens": 32498458.0, "reward": 0.69921875, "reward_std": 0.12981030344963074, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 198 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.246439870363116e-09, "advantages/std": 0.5482960939407349, "advantages/var": 0.30062860663066715, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 0.8495197438633938, "grad_norm": 0.20952831368827907, "learning_rate": 9.315571923569892e-07, "loss": -0.0, "num_tokens": 32642747.0, "reward": 0.7109375, "reward_std": 0.13914892077445984, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 199 }, { "advantages/mean": -6.752088665962219e-09, "advantages/snr": 1.0210589122953355e-08, "advantages/std": 0.6612829566001892, "advantages/var": 0.4372951486898877, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.8537886872998933, "grad_norm": 0.30185620500016624, "learning_rate": 9.308805310995875e-07, "loss": 0.0, "num_tokens": 32794582.0, "reward": 0.73828125, "reward_std": 0.21937178075313568, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 200 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.734251092552386e-09, "advantages/std": 0.5960734486579895, "advantages/var": 0.35530355619502885, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.8580576307363927, "grad_norm": 0.2462816018228872, "learning_rate": 9.302007896300697e-07, "loss": -0.0, "num_tokens": 32961774.0, "reward": 0.63671875, "reward_std": 0.18003800511360168, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 201 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599654419242252e-09, "advantages/std": 0.404953271150589, "advantages/var": 0.16398715181556245, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 0.8623265741728922, "grad_norm": 0.16802665427080618, "learning_rate": 9.295179728076665e-07, "loss": 0.0, "num_tokens": 33116837.0, "reward": 0.72265625, "reward_std": 0.08390620350837708, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 202 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.516864526265062e-09, "advantages/std": 0.6185635328292847, "advantages/var": 0.38262084414624553, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.8665955176093917, "grad_norm": 0.25804002111389396, "learning_rate": 9.288320855135934e-07, "loss": -0.0, "num_tokens": 33286278.0, "reward": 0.61328125, "reward_std": 0.17912934720516205, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 203 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.991711378624568e-09, "advantages/std": 0.46759915351867676, "advantages/var": 0.21864896837138303, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 0.8708644610458911, "grad_norm": 0.22827939959255067, "learning_rate": 9.281431326510152e-07, "loss": -0.0, "num_tokens": 33443163.0, "reward": 0.6640625, "reward_std": 0.11165857315063477, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 204 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.0310319200493365e-09, "advantages/std": 0.5960649251937866, "advantages/var": 0.35529339504627444, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.8751334044823906, "grad_norm": 0.22458008637077748, "learning_rate": 9.274511191450119e-07, "loss": -0.0, "num_tokens": 33610355.0, "reward": 0.6875, "reward_std": 0.16925784945487976, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 205 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.643394011345789e-09, "advantages/std": 0.5483102798461914, "advantages/var": 0.30064416298500873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.87940234791889, "grad_norm": 0.2290235050594148, "learning_rate": 9.267560499425424e-07, "loss": 0.0, "num_tokens": 33762863.0, "reward": 0.7734375, "reward_std": 0.1539955586194992, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 206 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 7.514663704136494e-09, "advantages/std": 0.681637167930603, "advantages/var": 0.4646292287044531, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.8836712913553896, "grad_norm": 0.2785376078782206, "learning_rate": 9.260579300124099e-07, "loss": 0.0, "num_tokens": 33917344.0, "reward": 0.671875, "reward_std": 0.23672226071357727, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 207 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.812217066674933e-10, "advantages/std": 0.5960680246353149, "advantages/var": 0.3552970899926464, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.887940234791889, "grad_norm": 0.29013235358343903, "learning_rate": 9.253567643452262e-07, "loss": 0.0, "num_tokens": 34092208.0, "reward": 0.59765625, "reward_std": 0.1726752519607544, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4913311004638672, "step": 208 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449885533156584e-09, "advantages/std": 0.404936283826828, "advantages/var": 0.1639733939594814, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 0.8922091782283885, "grad_norm": 0.2085125843665539, "learning_rate": 9.246525579533764e-07, "loss": 0.0, "num_tokens": 34244162.0, "reward": 0.64453125, "reward_std": 0.06970865279436111, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 209 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5483095049858093, "advantages/var": 0.30064331325778326, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.8964781216648879, "grad_norm": 0.18475872134993088, "learning_rate": 9.23945315870982e-07, "loss": 0.0, "num_tokens": 34403014.0, "reward": 0.6484375, "reward_std": 0.15452352166175842, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 210 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.859129990066525e-09, "advantages/std": 0.5960713624954224, "advantages/var": 0.3553010691871492, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.9007470651013874, "grad_norm": 0.29702733030568923, "learning_rate": 9.232350431538656e-07, "loss": 0.0, "num_tokens": 34564823.0, "reward": 0.67578125, "reward_std": 0.17662307620048523, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 211 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5624249773879439e-09, "advantages/std": 0.5960750579833984, "advantages/var": 0.3553054747499118, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 0.9050160085378869, "grad_norm": 0.21062195720897037, "learning_rate": 9.225217448795154e-07, "loss": 0.0, "num_tokens": 34741244.0, "reward": 0.6640625, "reward_std": 0.18280631303787231, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 212 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3306388556957245, "advantages/var": 0.10932205289577812, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 0.9092849519743863, "grad_norm": 0.14741888821412977, "learning_rate": 9.218054261470476e-07, "loss": 0.0, "num_tokens": 34873153.0, "reward": 0.796875, "reward_std": 0.05326685309410095, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 213 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.453650819722754e-09, "advantages/std": 0.522786021232605, "advantages/var": 0.2733052239962177, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.9135538954108858, "grad_norm": 0.15938017835116514, "learning_rate": 9.210860920771705e-07, "loss": -0.0, "num_tokens": 35048090.0, "reward": 0.60546875, "reward_std": 0.13322526216506958, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 214 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.492832655171976e-10, "advantages/std": 0.5482991337776184, "advantages/var": 0.3006319401012867, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 0.9178228388473852, "grad_norm": 0.22427508241794308, "learning_rate": 9.203637478121491e-07, "loss": 0.0, "num_tokens": 35218953.0, "reward": 0.6015625, "reward_std": 0.14203590154647827, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4905354380607605, "step": 215 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 7.151580191519881e-09, "advantages/std": 0.6185740828514099, "advantages/var": 0.38263389597546293, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 0.9220917822838848, "grad_norm": 0.20779930017559525, "learning_rate": 9.196383985157656e-07, "loss": 0.0, "num_tokens": 35366948.0, "reward": 0.78515625, "reward_std": 0.1922660619020462, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 216 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2739017715075114e-09, "advantages/std": 0.5483090877532959, "advantages/var": 0.30064285571285154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 0.9263607257203842, "grad_norm": 0.22856666349627422, "learning_rate": 9.189100493732851e-07, "loss": -0.0, "num_tokens": 35510655.0, "reward": 0.77734375, "reward_std": 0.15228809416294098, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 217 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.3971400787152124e-09, "advantages/std": 0.5482980012893677, "advantages/var": 0.30063069821791544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 0.9306296691568837, "grad_norm": 0.22475360749870224, "learning_rate": 9.181787055914175e-07, "loss": 0.0, "num_tokens": 35667550.0, "reward": 0.48046875, "reward_std": 0.14032843708992004, "rewards/drgrpo_math_reward/mean": 0.48046875, "rewards/drgrpo_math_reward/std": 0.5005971193313599, "step": 218 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.786289910659712e-09, "advantages/std": 0.6185793280601501, "advantages/var": 0.38264038510334686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 0.9348986125933831, "grad_norm": 0.244956248158164, "learning_rate": 9.174443723982799e-07, "loss": 0.0, "num_tokens": 35833973.0, "reward": 0.65234375, "reward_std": 0.1996288150548935, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 219 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 7.636313400668574e-09, "advantages/std": 0.6402884721755981, "advantages/var": 0.4099693276009617, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 0.9391675560298826, "grad_norm": 0.24276848059353862, "learning_rate": 9.167070550433602e-07, "loss": 0.0, "num_tokens": 35997006.0, "reward": 0.703125, "reward_std": 0.21173818409442902, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 220 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022429163667084e-09, "advantages/std": 0.6185693740844727, "advantages/var": 0.38262807055525627, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.9434364994663821, "grad_norm": 0.23098088315813128, "learning_rate": 9.159667587974785e-07, "loss": 0.0, "num_tokens": 36144820.0, "reward": 0.73828125, "reward_std": 0.18596413731575012, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 221 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755638038437843e-09, "advantages/std": 0.49595969915390015, "advantages/var": 0.24597602318482714, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.9477054429028815, "grad_norm": 0.203738257085365, "learning_rate": 9.152234889527501e-07, "loss": -0.0, "num_tokens": 36302905.0, "reward": 0.7265625, "reward_std": 0.12099964916706085, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 222 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.528071999011651e-10, "advantages/std": 0.6185664534568787, "advantages/var": 0.38262445734222084, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.951974386339381, "grad_norm": 0.2859031051149433, "learning_rate": 9.144772508225476e-07, "loss": -0.0, "num_tokens": 36465340.0, "reward": 0.609375, "reward_std": 0.1825467348098755, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 223 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 4.893158628587196e-09, "advantages/std": 0.6185775995254517, "advantages/var": 0.38263824663467005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.9562433297758804, "grad_norm": 0.33728645483097885, "learning_rate": 9.137280497414628e-07, "loss": 0.0, "num_tokens": 36622798.0, "reward": 0.71875, "reward_std": 0.19674429297447205, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 224 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.269580834375063e-09, "advantages/std": 0.6185746192932129, "advantages/var": 0.38263455963374327, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 0.96051227321238, "grad_norm": 0.23678336081671283, "learning_rate": 9.129758910652683e-07, "loss": 0.0, "num_tokens": 36787184.0, "reward": 0.65234375, "reward_std": 0.1933268904685974, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 225 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.5478913430760094e-09, "advantages/std": 0.5482901930809021, "advantages/var": 0.3006221358286929, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.9647812166488794, "grad_norm": 0.23416868197278523, "learning_rate": 9.122207801708801e-07, "loss": -0.0, "num_tokens": 36934154.0, "reward": 0.703125, "reward_std": 0.13178616762161255, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 226 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5624437258124148e-09, "advantages/std": 0.5960679054260254, "advantages/var": 0.35529694787896915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.9690501600853789, "grad_norm": 0.24178276531312257, "learning_rate": 9.114627224563181e-07, "loss": 0.0, "num_tokens": 37097501.0, "reward": 0.71484375, "reward_std": 0.17097023129463196, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 227 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.389007647043238e-10, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.9733191035218783, "grad_norm": 0.21515667717388337, "learning_rate": 9.10701723340668e-07, "loss": 0.0, "num_tokens": 37255645.0, "reward": 0.81640625, "reward_std": 0.1255940943956375, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 228 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.972471718244625e-09, "advantages/std": 0.5483027696609497, "advantages/var": 0.30063592721786847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 0.9775880469583778, "grad_norm": 0.2875591708504233, "learning_rate": 9.099377882640424e-07, "loss": 0.0, "num_tokens": 37391494.0, "reward": 0.8046875, "reward_std": 0.1459837108850479, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 229 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 6.3695981791179415e-09, "advantages/std": 0.5483013987541199, "advantages/var": 0.30063442387572437, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "epoch": 0.9818569903948773, "grad_norm": 0.23429010911438367, "learning_rate": 9.091709226875428e-07, "loss": -0.0, "num_tokens": 37564614.0, "reward": 0.609375, "reward_std": 0.14545084536075592, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 230 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.2861422109191467e-09, "advantages/std": 0.4959658980369568, "advantages/var": 0.24598217201560502, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.9861259338313767, "grad_norm": 0.21210491205337761, "learning_rate": 9.084011320932188e-07, "loss": -0.0, "num_tokens": 37722632.0, "reward": 0.63671875, "reward_std": 0.12783199548721313, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 231 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.6364006470548067e-10, "advantages/std": 0.6402777433395386, "advantages/var": 0.40995558861597203, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 0.9903948772678762, "grad_norm": 0.3265367211352305, "learning_rate": 9.076284219840304e-07, "loss": -0.0, "num_tokens": 37898519.0, "reward": 0.53125, "reward_std": 0.19754064083099365, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.5, "step": 232 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 1.1178199929019476e-08, "advantages/std": 0.43740883469581604, "advantages/var": 0.19132648866995172, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 0.9946638207043756, "grad_norm": 0.20031446384378207, "learning_rate": 9.068527978838084e-07, "loss": 0.0, "num_tokens": 38052365.0, "reward": 0.73828125, "reward_std": 0.10691195726394653, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 233 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.919882841693015e-09, "advantages/std": 0.4374060034751892, "advantages/var": 0.19132401187613723, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 0.9989327641408752, "grad_norm": 0.18502191003874308, "learning_rate": 9.060742653372142e-07, "loss": -0.0, "num_tokens": 38203481.0, "reward": 0.70703125, "reward_std": 0.10349701344966888, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 234 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.572685182094574, "advantages/var": 0.32796831779069535, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.0042689434364995, "grad_norm": 0.2681599551540745, "learning_rate": 9.052928299097012e-07, "loss": -0.0, "num_tokens": 38362882.0, "reward": 0.62109375, "reward_std": 0.1626850962638855, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 235 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.9724594393523208e-09, "advantages/std": 0.5483050346374512, "advantages/var": 0.30063841100877653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.0085378868729988, "grad_norm": 0.23210538289797444, "learning_rate": 9.045084971874737e-07, "loss": 0.0, "num_tokens": 38502567.0, "reward": 0.796875, "reward_std": 0.14939865469932556, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 236 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.8181795048296057e-09, "advantages/std": 0.640285074710846, "advantages/var": 0.4099649768974736, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.0128068303094984, "grad_norm": 0.2611427222928886, "learning_rate": 9.037212727774485e-07, "loss": 0.0, "num_tokens": 38659097.0, "reward": 0.68359375, "reward_std": 0.20725995302200317, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 237 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814572812374478e-09, "advantages/std": 0.5227869153022766, "advantages/var": 0.27330615881126974, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 1.017075773745998, "grad_norm": 0.19890422356269907, "learning_rate": 9.029311623072137e-07, "loss": -0.0, "num_tokens": 38828844.0, "reward": 0.67578125, "reward_std": 0.1344023048877716, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 238 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.944765720258187e-09, "advantages/std": 0.5483191609382629, "advantages/var": 0.3006539022520407, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.0213447171824974, "grad_norm": 0.2278453686272756, "learning_rate": 9.021381714249887e-07, "loss": 0.0, "num_tokens": 38990118.0, "reward": 0.7734375, "reward_std": 0.16424530744552612, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 239 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5624385699508314e-09, "advantages/std": 0.596069872379303, "advantages/var": 0.35529929275827854, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.0256136606189967, "grad_norm": 0.2542975337210786, "learning_rate": 9.013423057995844e-07, "loss": -0.0, "num_tokens": 39162750.0, "reward": 0.62890625, "reward_std": 0.17597398161888123, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 240 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.12489245127268e-09, "advantages/std": 0.596066951751709, "advantages/var": 0.35529581097057417, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.0298826040554963, "grad_norm": 0.23443867005042662, "learning_rate": 9.005435711203618e-07, "loss": 0.0, "num_tokens": 39316222.0, "reward": 0.7890625, "reward_std": 0.17096778750419617, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 241 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.812088953055965e-10, "advantages/std": 0.5960777997970581, "advantages/var": 0.3553087434109017, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.0341515474919958, "grad_norm": 0.23796595572080942, "learning_rate": 8.997419730971916e-07, "loss": -0.0, "num_tokens": 39464279.0, "reward": 0.7265625, "reward_std": 0.18569329380989075, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 242 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1231797745105085e-09, "advantages/std": 0.5483064651489258, "advantages/var": 0.30063997972411016, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.038420490928495, "grad_norm": 0.22286596877086656, "learning_rate": 8.989375174604141e-07, "loss": 0.0, "num_tokens": 39621864.0, "reward": 0.6484375, "reward_std": 0.15163654088974, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 243 }, { "advantages/mean": -5.3551048040390015e-09, "advantages/snr": 1.0797358794099725e-08, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 1.0426894343649946, "grad_norm": 0.21043020237501475, "learning_rate": 8.981302099607972e-07, "loss": 0.0, "num_tokens": 39785296.0, "reward": 0.73828125, "reward_std": 0.1255940943956375, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 244 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2267840268747123e-09, "advantages/std": 0.5227957367897034, "advantages/var": 0.2733153824054888, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.0469583778014941, "grad_norm": 0.19849666793713738, "learning_rate": 8.973200563694963e-07, "loss": -0.0, "num_tokens": 39966617.0, "reward": 0.640625, "reward_std": 0.14400538802146912, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 245 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175644599137753e-09, "advantages/std": 0.5227845311164856, "advantages/var": 0.2733036659746837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.0512273212379937, "grad_norm": 0.2371693296860179, "learning_rate": 8.965070624780115e-07, "loss": 0.0, "num_tokens": 40132643.0, "reward": 0.64453125, "reward_std": 0.13098734617233276, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 246 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.2463106191180555e-10, "advantages/std": 0.548312783241272, "advantages/var": 0.3006469082657901, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.055496264674493, "grad_norm": 0.23997233729464268, "learning_rate": 8.956912340981484e-07, "loss": -0.0, "num_tokens": 40283074.0, "reward": 0.72265625, "reward_std": 0.15623590350151062, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 247 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.971350610904034e-09, "advantages/std": 0.46757495403289795, "advantages/var": 0.21862633763886663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.0597652081109925, "grad_norm": 0.2525168510699522, "learning_rate": 8.948725770619744e-07, "loss": 0.0, "num_tokens": 40423442.0, "reward": 0.7578125, "reward_std": 0.0883883461356163, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 248 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907167588560933e-10, "advantages/std": 0.5227938890457153, "advantages/var": 0.2733134504235437, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.064034151547492, "grad_norm": 0.19726712226980916, "learning_rate": 8.940510972217785e-07, "loss": -0.0, "num_tokens": 40587006.0, "reward": 0.71484375, "reward_std": 0.14123709499835968, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 249 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 9.69288060329602e-10, "advantages/std": 0.7206236720085144, "advantages/var": 0.519298476659035, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.0683030949839916, "grad_norm": 0.2883836841361683, "learning_rate": 8.932268004500287e-07, "loss": -0.0, "num_tokens": 40751302.0, "reward": 0.72265625, "reward_std": 0.2730144262313843, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 250 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.095673003899881e-09, "advantages/std": 0.5483019948005676, "advantages/var": 0.3006350775022817, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.0725720384204909, "grad_norm": 0.2146721541501632, "learning_rate": 8.923996926393305e-07, "loss": -0.0, "num_tokens": 40902937.0, "reward": 0.6796875, "reward_std": 0.14651167392730713, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 251 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987671798143373e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.0768409818569904, "grad_norm": 0.20976273414921642, "learning_rate": 8.91569779702384e-07, "loss": -0.0, "num_tokens": 41060790.0, "reward": 0.671875, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 252 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.6918481539330074e-09, "advantages/std": 0.5726837515830994, "advantages/var": 0.32796667932729306, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.08110992529349, "grad_norm": 0.2813986497841566, "learning_rate": 8.907370675719427e-07, "loss": 0.0, "num_tokens": 41208791.0, "reward": 0.69921875, "reward_std": 0.16044719517230988, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 253 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.9723957846157257e-09, "advantages/std": 0.5483167767524719, "advantages/var": 0.30065128766822014, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.0853788687299892, "grad_norm": 0.2259849805731124, "learning_rate": 8.899015622007702e-07, "loss": -0.0, "num_tokens": 41363716.0, "reward": 0.640625, "reward_std": 0.1624191403388977, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 254 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.408345822654131e-09, "advantages/std": 0.6612882614135742, "advantages/var": 0.4373021646833877, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.0896478121664888, "grad_norm": 0.2887201039560959, "learning_rate": 8.890632695615982e-07, "loss": -0.0, "num_tokens": 41534919.0, "reward": 0.5859375, "reward_std": 0.22738116979599, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.4935242533683777, "step": 255 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175854313291422e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.0939167556029883, "grad_norm": 0.19084207845075485, "learning_rate": 8.882221956470836e-07, "loss": 0.0, "num_tokens": 41699529.0, "reward": 0.640625, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 256 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.645987625947185e-09, "advantages/std": 0.6185737252235413, "advantages/var": 0.3826334535369291, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.0981856990394878, "grad_norm": 0.32427544566232364, "learning_rate": 8.873783464697653e-07, "loss": 0.0, "num_tokens": 41857360.0, "reward": 0.6875, "reward_std": 0.19332444667816162, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 257 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.906072989360551e-09, "advantages/std": 0.5960734486579895, "advantages/var": 0.35530355619502885, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.1024546424759871, "grad_norm": 0.26379798941422533, "learning_rate": 8.865317280620219e-07, "loss": 0.0, "num_tokens": 42022785.0, "reward": 0.64453125, "reward_std": 0.18003800511360168, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 258 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 2.987613550456549e-09, "advantages/std": 0.7013878226280212, "advantages/var": 0.4919448777308766, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.1067235859124867, "grad_norm": 0.344142203779848, "learning_rate": 8.856823464760282e-07, "loss": -0.0, "num_tokens": 42171320.0, "reward": 0.625, "reward_std": 0.233575701713562, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4850712716579437, "step": 259 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196938070589645e-09, "advantages/std": 0.5726780891418457, "advantages/var": 0.3279601937831558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 1.1109925293489862, "grad_norm": 0.23515318491841494, "learning_rate": 8.84830207783712e-07, "loss": 0.0, "num_tokens": 42335796.0, "reward": 0.55078125, "reward_std": 0.1530844271183014, "rewards/drgrpo_math_reward/mean": 0.55078125, "rewards/drgrpo_math_reward/std": 0.49838894605636597, "step": 260 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.90728742173617e-10, "advantages/std": 0.5227868556976318, "advantages/var": 0.27330609649021653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.1152614727854857, "grad_norm": 0.2014522180480839, "learning_rate": 8.839753180767107e-07, "loss": -0.0, "num_tokens": 42502750.0, "reward": 0.70703125, "reward_std": 0.13269728422164917, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 261 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.2196931723273553e-08, "advantages/std": 0.5726783871650696, "advantages/var": 0.32796053512598533, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.119530416221985, "grad_norm": 0.2783647958076793, "learning_rate": 8.831176834663273e-07, "loss": 0.0, "num_tokens": 42666009.0, "reward": 0.640625, "reward_std": 0.1536148637533188, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 262 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6722203494045257e-09, "advantages/std": 0.522780179977417, "advantages/var": 0.2732991165772205, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.1237993596584845, "grad_norm": 0.22229712386788078, "learning_rate": 8.822573100834879e-07, "loss": 0.0, "num_tokens": 42809029.0, "reward": 0.78125, "reward_std": 0.1263929009437561, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 263 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.1257210721944304e-09, "advantages/std": 0.5227948427200317, "advantages/var": 0.2733144475746627, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.128068303094984, "grad_norm": 0.21192413693613738, "learning_rate": 8.813942040786963e-07, "loss": -0.0, "num_tokens": 43000488.0, "reward": 0.546875, "reward_std": 0.1428283452987671, "rewards/drgrpo_math_reward/mean": 0.546875, "rewards/drgrpo_math_reward/std": 0.4987730085849762, "step": 264 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.7260719743060095e-09, "advantages/std": 0.4374082088470459, "advantages/var": 0.19132594116678092, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 1.1323372465314834, "grad_norm": 0.18260371159172073, "learning_rate": 8.805283716219915e-07, "loss": 0.0, "num_tokens": 43169988.0, "reward": 0.625, "reward_std": 0.10626532137393951, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4850712716579437, "step": 265 }, { "advantages/mean": -5.3551048040390015e-09, "advantages/snr": 8.984029862526269e-09, "advantages/std": 0.5960693359375, "advantages/var": 0.35529865324497223, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.136606189967983, "grad_norm": 0.24260356114505385, "learning_rate": 8.796598189029029e-07, "loss": 0.0, "num_tokens": 43331520.0, "reward": 0.72265625, "reward_std": 0.17491313815116882, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 266 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.789606785993915e-09, "advantages/std": 0.5227985978126526, "advantages/var": 0.2733183738748757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.1408751334044824, "grad_norm": 0.2635927347438612, "learning_rate": 8.787885521304055e-07, "loss": 0.0, "num_tokens": 43490313.0, "reward": 0.69140625, "reward_std": 0.14636196196079254, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 267 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 8.068112086076467e-09, "advantages/std": 0.5483044981956482, "advantages/var": 0.3006378227415816, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.1451440768409817, "grad_norm": 0.2815394159637722, "learning_rate": 8.779145775328764e-07, "loss": -0.0, "num_tokens": 43645813.0, "reward": 0.66796875, "reward_std": 0.14875200390815735, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 268 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.5477847097207762e-09, "advantages/std": 0.5483131408691406, "advantages/var": 0.30064730044978205, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.1494130202774813, "grad_norm": 0.2345865010716582, "learning_rate": 8.770379013580507e-07, "loss": -0.0, "num_tokens": 43806619.0, "reward": 0.734375, "reward_std": 0.15847133100032806, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 269 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.492707096246389e-10, "advantages/std": 0.5483072400093079, "advantages/var": 0.30064082944662474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.1536819637139808, "grad_norm": 0.24516543455701023, "learning_rate": 8.761585298729748e-07, "loss": 0.0, "num_tokens": 43962231.0, "reward": 0.703125, "reward_std": 0.15110856294631958, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 270 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979379684270509e-09, "advantages/std": 0.4675896465778351, "advantages/var": 0.21864007758678472, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.1579509071504803, "grad_norm": 0.24033704608392403, "learning_rate": 8.752764693639638e-07, "loss": 0.0, "num_tokens": 44114241.0, "reward": 0.671875, "reward_std": 0.10258589684963226, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 271 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.48964112740099e-09, "advantages/std": 0.4675987958908081, "advantages/var": 0.21864863391853362, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.1622198505869796, "grad_norm": 0.23283406072005608, "learning_rate": 8.743917261365557e-07, "loss": 0.0, "num_tokens": 44280233.0, "reward": 0.58984375, "reward_std": 0.11112815141677856, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 272 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.485510240816933e-09, "advantages/std": 0.467597097158432, "advantages/var": 0.2186470452709921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.1664887940234792, "grad_norm": 0.3827013434118944, "learning_rate": 8.73504306515466e-07, "loss": -0.0, "num_tokens": 44427455.0, "reward": 0.73828125, "reward_std": 0.10889027267694473, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 273 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.520296218227876e-09, "advantages/std": 0.5483036041259766, "advantages/var": 0.3006368422975356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.1707577374599787, "grad_norm": 0.2654311262205045, "learning_rate": 8.726142168445425e-07, "loss": 0.0, "num_tokens": 44576560.0, "reward": 0.6484375, "reward_std": 0.14716076850891113, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 274 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.5478470268314738e-09, "advantages/std": 0.5482997298240662, "advantages/var": 0.30063259372514395, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.1750266808964782, "grad_norm": 0.2816453673564088, "learning_rate": 8.717214634867211e-07, "loss": 0.0, "num_tokens": 44750347.0, "reward": 0.6015625, "reward_std": 0.14309673011302948, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4905354380607605, "step": 275 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.634804126955208e-09, "advantages/std": 0.618571400642395, "advantages/var": 0.3826305776926944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.1792956243329775, "grad_norm": 0.23661445655492572, "learning_rate": 8.708260528239788e-07, "loss": -0.0, "num_tokens": 44920222.0, "reward": 0.62109375, "reward_std": 0.18937908113002777, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 276 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.816750546713503e-09, "advantages/std": 0.6612744331359863, "advantages/var": 0.43728387591932005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.183564567769477, "grad_norm": 0.26084562938037026, "learning_rate": 8.699279912572888e-07, "loss": -0.0, "num_tokens": 45097288.0, "reward": 0.55859375, "reward_std": 0.20858919620513916, "rewards/drgrpo_math_reward/mean": 0.55859375, "rewards/drgrpo_math_reward/std": 0.4975275993347168, "step": 277 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.789625268215301e-09, "advantages/std": 0.5227969288825989, "advantages/var": 0.27331662884907715, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 1.1878335112059766, "grad_norm": 0.25195978563158833, "learning_rate": 8.690272852065748e-07, "loss": -0.0, "num_tokens": 45265747.0, "reward": 0.67578125, "reward_std": 0.14571286737918854, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 278 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344476650405274e-09, "advantages/std": 0.5227766633033752, "advantages/var": 0.27329543969461056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.1921024546424759, "grad_norm": 0.19475972227751492, "learning_rate": 8.68123941110665e-07, "loss": -0.0, "num_tokens": 45425884.0, "reward": 0.86328125, "reward_std": 0.12297550588846207, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.34422317147254944, "step": 279 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 1.0615910412769516e-08, "advantages/std": 0.548305869102478, "advantages/var": 0.30063932609222377, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 1.1963713980789754, "grad_norm": 0.26798133987683354, "learning_rate": 8.67217965427246e-07, "loss": 0.0, "num_tokens": 45569425.0, "reward": 0.84375, "reward_std": 0.15057571232318878, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3638034462928772, "step": 280 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.200640341515475, "grad_norm": 0.3170370623918755, "learning_rate": 8.663093646328166e-07, "loss": 0.0, "num_tokens": 45728046.0, "reward": 0.6953125, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 281 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344285320354163e-09, "advantages/std": 0.5227953791618347, "advantages/var": 0.2733150084729665, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.2049092849519745, "grad_norm": 0.24331646146386493, "learning_rate": 8.653981452226417e-07, "loss": 0.0, "num_tokens": 45873852.0, "reward": 0.76953125, "reward_std": 0.1434749811887741, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 282 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.285310125301036e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.2091782283884738, "grad_norm": 0.2142418838550912, "learning_rate": 8.644843137107057e-07, "loss": 0.0, "num_tokens": 46047250.0, "reward": 0.59375, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.49209436774253845, "step": 283 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.8553042319251684e-09, "advantages/std": 0.437404602766037, "advantages/var": 0.1913227865209146, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.2134471718249733, "grad_norm": 0.1783701475963193, "learning_rate": 8.635678766296661e-07, "loss": 0.0, "num_tokens": 46205057.0, "reward": 0.703125, "reward_std": 0.10178953409194946, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 284 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.5155159600246683e-09, "advantages/std": 0.5960649251937866, "advantages/var": 0.35529339504627444, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.2177161152614728, "grad_norm": 0.2221401615180632, "learning_rate": 8.626488405308066e-07, "loss": 0.0, "num_tokens": 46368520.0, "reward": 0.6640625, "reward_std": 0.16925784945487976, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 285 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196389686850863e-09, "advantages/std": 0.5727038383483887, "advantages/var": 0.3279896864589773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.2219850586979724, "grad_norm": 0.24138394163239055, "learning_rate": 8.617272119839902e-07, "loss": -0.0, "num_tokens": 46523291.0, "reward": 0.7109375, "reward_std": 0.18424785137176514, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 286 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.5478276389592506e-09, "advantages/std": 0.5483039021492004, "advantages/var": 0.30063716911203997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 1.2262540021344717, "grad_norm": 0.20263774043829955, "learning_rate": 8.608029975776128e-07, "loss": -0.0, "num_tokens": 46678953.0, "reward": 0.76171875, "reward_std": 0.14769119024276733, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 287 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917351169221575e-09, "advantages/std": 0.46759358048439026, "advantages/var": 0.21864375651021195, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.2305229455709712, "grad_norm": 0.21616322659681173, "learning_rate": 8.598762039185552e-07, "loss": 0.0, "num_tokens": 46830553.0, "reward": 0.79296875, "reward_std": 0.1060032919049263, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 288 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.898892464424674e-09, "advantages/std": 0.5227991938591003, "advantages/var": 0.2733189970997252, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.2347918890074707, "grad_norm": 0.19897774941835714, "learning_rate": 8.589468376321367e-07, "loss": -0.0, "num_tokens": 46992262.0, "reward": 0.66015625, "reward_std": 0.14742279052734375, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 289 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.9530138407463264e-09, "advantages/std": 0.5960803627967834, "advantages/var": 0.355311798911945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.23906083244397, "grad_norm": 0.24347332154396464, "learning_rate": 8.580149053620674e-07, "loss": 0.0, "num_tokens": 47158417.0, "reward": 0.67578125, "reward_std": 0.18804985284805298, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 290 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.8167024633676336e-09, "advantages/std": 0.49596428871154785, "advantages/var": 0.2459805756771516, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.2433297758804696, "grad_norm": 0.19247935669529417, "learning_rate": 8.570804137704003e-07, "loss": 0.0, "num_tokens": 47314092.0, "reward": 0.68359375, "reward_std": 0.1255940943956375, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 291 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.6615166122208e-09, "advantages/std": 0.4374021887779236, "advantages/var": 0.1913206747477183, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.247598719316969, "grad_norm": 0.1410680828176049, "learning_rate": 8.561433695374848e-07, "loss": 0.0, "num_tokens": 47475514.0, "reward": 0.80078125, "reward_std": 0.10061002522706985, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 292 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.6363810130421453e-09, "advantages/std": 0.6402812004089355, "advantages/var": 0.4099600155971075, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.2518676627534684, "grad_norm": 0.29802936592619206, "learning_rate": 8.552037793619175e-07, "loss": -0.0, "num_tokens": 47642570.0, "reward": 0.74609375, "reward_std": 0.20043008029460907, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 293 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.906044476527982e-09, "advantages/std": 0.5960777997970581, "advantages/var": 0.3553087434109017, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.256136606189968, "grad_norm": 0.27625707603096233, "learning_rate": 8.542616499604957e-07, "loss": 0.0, "num_tokens": 47807632.0, "reward": 0.6640625, "reward_std": 0.18569329380989075, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 294 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.2250712129826735e-09, "advantages/std": 0.4959622323513031, "advantages/var": 0.24597853591888796, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.2604055496264674, "grad_norm": 0.23483656229841748, "learning_rate": 8.533169880681681e-07, "loss": 0.0, "num_tokens": 47959296.0, "reward": 0.7578125, "reward_std": 0.1244145929813385, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 295 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 8.197904556506459e-09, "advantages/std": 0.6816297173500061, "advantages/var": 0.4646190715746492, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.264674493062967, "grad_norm": 0.24304630329493054, "learning_rate": 8.523698004379875e-07, "loss": 0.0, "num_tokens": 48133667.0, "reward": 0.68359375, "reward_std": 0.22594210505485535, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 296 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.8778028834468404e-09, "advantages/std": 0.4959639608860016, "advantages/var": 0.2459802504977313, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.2689434364994665, "grad_norm": 0.19984218005460322, "learning_rate": 8.514200938410627e-07, "loss": -0.0, "num_tokens": 48299805.0, "reward": 0.6796875, "reward_std": 0.1250636875629425, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 297 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524389602364913e-09, "advantages/std": 0.572691798210144, "advantages/var": 0.32797589573716834, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.2732123799359658, "grad_norm": 0.24154623696060273, "learning_rate": 8.504678750665093e-07, "loss": -0.0, "num_tokens": 48459104.0, "reward": 0.6484375, "reward_std": 0.17005029320716858, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 298 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.8786360989624384e-09, "advantages/std": 0.5726944208145142, "advantages/var": 0.32797889963207183, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.2774813233724653, "grad_norm": 0.23284752992750668, "learning_rate": 8.495131509214013e-07, "loss": 0.0, "num_tokens": 48614547.0, "reward": 0.65234375, "reward_std": 0.172406867146492, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 299 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.226759654792445e-09, "advantages/std": 0.5228014588356018, "advantages/var": 0.27332136536063345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.2817502668089649, "grad_norm": 0.2681174827752255, "learning_rate": 8.485559282307235e-07, "loss": 0.0, "num_tokens": 48774856.0, "reward": 0.72265625, "reward_std": 0.14913272857666016, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 300 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814442823074754e-09, "advantages/std": 0.5227907299995422, "advantages/var": 0.27331014737345427, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 1.2860192102454642, "grad_norm": 0.238237970833212, "learning_rate": 8.475962138373212e-07, "loss": -0.0, "num_tokens": 48928475.0, "reward": 0.76171875, "reward_std": 0.13835011422634125, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 301 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.528019770580436e-10, "advantages/std": 0.6185707449913025, "advantages/var": 0.382629766559095, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.2902881536819637, "grad_norm": 0.24635422188933434, "learning_rate": 8.466340146018522e-07, "loss": 0.0, "num_tokens": 49094017.0, "reward": 0.71484375, "reward_std": 0.189907044172287, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 302 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.493818094524657e-09, "advantages/std": 0.46758833527565, "advantages/var": 0.2186388512858537, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.2945570971184632, "grad_norm": 0.15612960075723187, "learning_rate": 8.456693374027378e-07, "loss": -0.0, "num_tokens": 49260136.0, "reward": 0.68359375, "reward_std": 0.10087842494249344, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 303 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.599761390615809e-09, "advantages/std": 0.4049438536167145, "advantages/var": 0.16397952458195508, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.2988260405549625, "grad_norm": 0.22303969723586534, "learning_rate": 8.44702189136113e-07, "loss": -0.0, "num_tokens": 49389252.0, "reward": 0.8046875, "reward_std": 0.0765409916639328, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 304 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.85536048412357e-09, "advantages/std": 0.4374004006385803, "advantages/var": 0.19131911047879058, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.303094983991462, "grad_norm": 0.22184859832120044, "learning_rate": 8.437325767157781e-07, "loss": 0.0, "num_tokens": 49534956.0, "reward": 0.74609375, "reward_std": 0.09837214648723602, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 305 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.953029854679673e-09, "advantages/std": 0.5960754752159119, "advantages/var": 0.35530597215387516, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.3073639274279616, "grad_norm": 0.29883705651901826, "learning_rate": 8.427605070731481e-07, "loss": 0.0, "num_tokens": 49695896.0, "reward": 0.62890625, "reward_std": 0.1817479282617569, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 306 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.102843969045556e-09, "advantages/std": 0.49596521258354187, "advantages/var": 0.24598149209303788, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.3116328708644611, "grad_norm": 0.19332884739544304, "learning_rate": 8.417859871572044e-07, "loss": 0.0, "num_tokens": 49850166.0, "reward": 0.72265625, "reward_std": 0.12677115201950073, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 307 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.49285019678803e-09, "advantages/std": 0.5482980012893677, "advantages/var": 0.30063069821791544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.3159018143009606, "grad_norm": 0.2742934910268536, "learning_rate": 8.408090239344441e-07, "loss": 0.0, "num_tokens": 50005479.0, "reward": 0.69921875, "reward_std": 0.14032843708992004, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 308 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022409433040476e-09, "advantages/std": 0.618571400642395, "advantages/var": 0.3826305776926944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.32017075773746, "grad_norm": 0.29720642123032165, "learning_rate": 8.39829624388831e-07, "loss": 0.0, "num_tokens": 50164299.0, "reward": 0.66015625, "reward_std": 0.18937906622886658, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 309 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907327028325072e-10, "advantages/std": 0.5227845311164856, "advantages/var": 0.2733036659746837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.3244397011739595, "grad_norm": 0.21448321446048993, "learning_rate": 8.38847795521745e-07, "loss": 0.0, "num_tokens": 50316273.0, "reward": 0.71484375, "reward_std": 0.13098734617233276, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 310 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.252507001666207e-09, "advantages/std": 0.5726798176765442, "advantages/var": 0.3279621735740399, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.328708644610459, "grad_norm": 0.24406260372773594, "learning_rate": 8.378635443519326e-07, "loss": 0.0, "num_tokens": 50469383.0, "reward": 0.734375, "reward_std": 0.15585274994373322, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 311 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252496846048193e-09, "advantages/std": 0.5726816058158875, "advantages/var": 0.3279642216398635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.3329775880469583, "grad_norm": 0.27648193971156826, "learning_rate": 8.368768779154562e-07, "loss": 0.0, "num_tokens": 50634882.0, "reward": 0.62890625, "reward_std": 0.15703225135803223, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 312 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.225127068226113e-09, "advantages/std": 0.4959556758403778, "advantages/var": 0.2459720323982859, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 1.3372465314834578, "grad_norm": 0.23643394395444925, "learning_rate": 8.358878032656445e-07, "loss": 0.0, "num_tokens": 50788065.0, "reward": 0.75, "reward_std": 0.11705183982849121, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 313 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.065633752082759e-10, "advantages/std": 0.5726798176765442, "advantages/var": 0.3279621735740399, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 1.3415154749199574, "grad_norm": 0.2949257223504475, "learning_rate": 8.348963274730412e-07, "loss": 0.0, "num_tokens": 50960166.0, "reward": 0.5703125, "reward_std": 0.15585274994373322, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4960011839866638, "step": 314 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.3457844183564567, "grad_norm": 0.21650235168913853, "learning_rate": 8.339024576253553e-07, "loss": -0.0, "num_tokens": 51138096.0, "reward": 0.6484375, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 315 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.859100109905457e-09, "advantages/std": 0.5960744023323059, "advantages/var": 0.3553046931158157, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.3500533617929562, "grad_norm": 0.2614104176232604, "learning_rate": 8.329062008274098e-07, "loss": 0.0, "num_tokens": 51297255.0, "reward": 0.75, "reward_std": 0.18004046380519867, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 316 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755680466206912e-09, "advantages/std": 0.49595409631729126, "advantages/var": 0.24597046565390102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 1.3543223052294557, "grad_norm": 0.22619377392286, "learning_rate": 8.319075642010913e-07, "loss": 0.0, "num_tokens": 51455445.0, "reward": 0.6953125, "reward_std": 0.11481393873691559, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 317 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.0646357329349468e-09, "advantages/std": 0.4373902380466461, "advantages/var": 0.19131022033850176, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 1.358591248665955, "grad_norm": 0.21774584926886661, "learning_rate": 8.309065548852989e-07, "loss": -0.0, "num_tokens": 51597160.0, "reward": 0.71484375, "reward_std": 0.08929947018623352, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 318 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.489578768976071e-09, "advantages/std": 0.46761050820350647, "advantages/var": 0.2186595873823416, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.3628601921024546, "grad_norm": 0.21396670829968895, "learning_rate": 8.299031800358931e-07, "loss": 0.0, "num_tokens": 51734808.0, "reward": 0.7890625, "reward_std": 0.12138034403324127, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 319 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 9.786257849423335e-09, "advantages/std": 0.6185813546180725, "advantages/var": 0.3826428922811296, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.367129135538954, "grad_norm": 0.24862330990714387, "learning_rate": 8.288974468256451e-07, "loss": -0.0, "num_tokens": 51904309.0, "reward": 0.60546875, "reward_std": 0.20304375886917114, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 320 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5624062294134083e-09, "advantages/std": 0.5960822105407715, "advantages/var": 0.3553140017231726, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.3713980789754536, "grad_norm": 0.273560968050573, "learning_rate": 8.278893624441847e-07, "loss": 0.0, "num_tokens": 52076956.0, "reward": 0.68359375, "reward_std": 0.1913485825061798, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 321 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.812061612278906e-10, "advantages/std": 0.5960798859596252, "advantages/var": 0.35531123044563984, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.3756670224119532, "grad_norm": 0.27764714587091543, "learning_rate": 8.268789340979498e-07, "loss": -0.0, "num_tokens": 52236966.0, "reward": 0.71875, "reward_std": 0.18740323185920715, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 322 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.193883478155236e-09, "advantages/std": 0.4373934864997864, "advantages/var": 0.1913130620324388, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.3799359658484525, "grad_norm": 0.18734396601228684, "learning_rate": 8.258661690101345e-07, "loss": 0.0, "num_tokens": 52380779.0, "reward": 0.6640625, "reward_std": 0.09324482083320618, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 323 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.618571400642395, "advantages/var": 0.3826305776926944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.384204909284952, "grad_norm": 0.28559951594842736, "learning_rate": 8.248510744206369e-07, "loss": -0.0, "num_tokens": 52553345.0, "reward": 0.57421875, "reward_std": 0.18937908113002777, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 324 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983447384004837e-09, "advantages/std": 0.46759626269340515, "advantages/var": 0.21864626488483996, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.3884738527214515, "grad_norm": 0.18535459393670384, "learning_rate": 8.238336575860083e-07, "loss": 0.0, "num_tokens": 52697338.0, "reward": 0.83984375, "reward_std": 0.10941824316978455, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.36746934056282043, "step": 325 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.907286406187238e-09, "advantages/std": 0.5227869153022766, "advantages/var": 0.27330615881126974, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.3927427961579508, "grad_norm": 0.22524115569862238, "learning_rate": 8.228139257794012e-07, "loss": 0.0, "num_tokens": 52862363.0, "reward": 0.73828125, "reward_std": 0.1344023048877716, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 326 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.285326628197485e-09, "advantages/std": 0.5726795196533203, "advantages/var": 0.3279618322303577, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.3970117395944504, "grad_norm": 0.2536769347881268, "learning_rate": 8.217918862905162e-07, "loss": -0.0, "num_tokens": 53016775.0, "reward": 0.77734375, "reward_std": 0.15532232820987701, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 327 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5624340390702578e-09, "advantages/std": 0.5960716009140015, "advantages/var": 0.35530135341618063, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 1.4012806830309499, "grad_norm": 0.33070974810083065, "learning_rate": 8.207675464255516e-07, "loss": -0.0, "num_tokens": 53200673.0, "reward": 0.5546875, "reward_std": 0.17715348303318024, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49797385931015015, "step": 328 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131157486115432e-10, "advantages/std": 0.572687566280365, "advantages/var": 0.32797104857212744, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.4055496264674492, "grad_norm": 0.33652698710835777, "learning_rate": 8.197409135071496e-07, "loss": 0.0, "num_tokens": 53348505.0, "reward": 0.7890625, "reward_std": 0.16492542624473572, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 329 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9917200106673123e-09, "advantages/std": 0.4675971269607544, "advantages/var": 0.21864707314195186, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.4098185699039487, "grad_norm": 0.2079452368372239, "learning_rate": 8.187119948743449e-07, "loss": -0.0, "num_tokens": 53509742.0, "reward": 0.66796875, "reward_std": 0.10889027267694473, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 330 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.979264165711202e-10, "advantages/std": 0.4676004946231842, "advantages/var": 0.21865022257184652, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.4140875133404482, "grad_norm": 0.19778389967748286, "learning_rate": 8.176807978825118e-07, "loss": 0.0, "num_tokens": 53673940.0, "reward": 0.74609375, "reward_std": 0.113366037607193, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 331 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33065417408943176, "advantages/var": 0.10933218284276425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 1.4183564567769478, "grad_norm": 0.19611606356891836, "learning_rate": 8.16647329903312e-07, "loss": -0.0, "num_tokens": 53826053.0, "reward": 0.65234375, "reward_std": 0.06404700875282288, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 332 }, { "advantages/mean": -6.28642737865448e-09, "advantages/snr": 1.1465324084482167e-08, "advantages/std": 0.5482991337776184, "advantages/var": 0.3006319401012867, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.4226254002134473, "grad_norm": 0.3003562797692729, "learning_rate": 8.156115983246419e-07, "loss": 0.0, "num_tokens": 53978781.0, "reward": 0.75, "reward_std": 0.14203590154647827, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 333 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 7.421619571772616e-09, "advantages/std": 0.596066951751709, "advantages/var": 0.35529581097057417, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.4268943436499466, "grad_norm": 0.2821425443858336, "learning_rate": 8.145736105505787e-07, "loss": 0.0, "num_tokens": 54130148.0, "reward": 0.7734375, "reward_std": 0.17096778750419617, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 334 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.51129367972865e-09, "advantages/std": 0.495958536863327, "advantages/var": 0.2459748702876121, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 1.4311632870864461, "grad_norm": 0.23600341276988868, "learning_rate": 8.135333740013294e-07, "loss": 0.0, "num_tokens": 54296752.0, "reward": 0.64453125, "reward_std": 0.12099719047546387, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 335 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983355987268271e-09, "advantages/std": 0.4676069915294647, "advantages/var": 0.2186562985272369, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.4354322305229457, "grad_norm": 0.1762747193777988, "learning_rate": 8.124908961131757e-07, "loss": 0.0, "num_tokens": 54461352.0, "reward": 0.765625, "reward_std": 0.11849336326122284, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 336 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.090908300523042e-09, "advantages/std": 0.6402843594551086, "advantages/var": 0.40996406096283877, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.439701173959445, "grad_norm": 0.24480847749808787, "learning_rate": 8.114461843384228e-07, "loss": 0.0, "num_tokens": 54623714.0, "reward": 0.79296875, "reward_std": 0.20608291029930115, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 337 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 4.0246066868757e-09, "advantages/std": 0.404962420463562, "advantages/var": 0.1639945619877068, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.4439701173959445, "grad_norm": 0.19200558195614426, "learning_rate": 8.103992461453445e-07, "loss": 0.0, "num_tokens": 54760860.0, "reward": 0.74609375, "reward_std": 0.09244601428508759, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 338 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726826786994934, "advantages/var": 0.3279654504824272, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.448239060832444, "grad_norm": 0.2429698349876259, "learning_rate": 8.093500890181307e-07, "loss": 0.0, "num_tokens": 54927519.0, "reward": 0.6328125, "reward_std": 0.15873971581459045, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 339 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.816677681974272e-09, "advantages/std": 0.6612915396690369, "advantages/var": 0.43730650043784536, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.4525080042689433, "grad_norm": 0.29709285080753056, "learning_rate": 8.082987204568335e-07, "loss": -0.0, "num_tokens": 55096190.0, "reward": 0.67578125, "reward_std": 0.23185941576957703, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 340 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.323084000677362e-09, "advantages/std": 0.4373980164527893, "advantages/var": 0.19131702479683454, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.4567769477054429, "grad_norm": 0.2037369960547768, "learning_rate": 8.072451479773143e-07, "loss": 0.0, "num_tokens": 55262990.0, "reward": 0.703125, "reward_std": 0.09719263017177582, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 341 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.9530466500626467e-09, "advantages/std": 0.5960703492164612, "advantages/var": 0.355299861215034, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.4610458911419424, "grad_norm": 0.2637448981496169, "learning_rate": 8.061893791111886e-07, "loss": 0.0, "num_tokens": 55413687.0, "reward": 0.71875, "reward_std": 0.17662061750888824, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 342 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.453634063173048e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.465314834578442, "grad_norm": 0.2520712775108035, "learning_rate": 8.05131421405774e-07, "loss": -0.0, "num_tokens": 55573021.0, "reward": 0.75, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 343 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.1336897468247492e-08, "advantages/std": 0.36967357993125916, "advantages/var": 0.13665855569919305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.4695837780149412, "grad_norm": 0.167234859844493, "learning_rate": 8.040712824240347e-07, "loss": 0.0, "num_tokens": 55724996.0, "reward": 0.76953125, "reward_std": 0.07232724130153656, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 344 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970936510266691e-09, "advantages/std": 0.4676027297973633, "advantages/var": 0.21865231291394593, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.4738527214514408, "grad_norm": 0.1888848391325854, "learning_rate": 8.030089697445286e-07, "loss": 0.0, "num_tokens": 55891954.0, "reward": 0.640625, "reward_std": 0.1145455539226532, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 345 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2268017985210387e-09, "advantages/std": 0.5227915644645691, "advantages/var": 0.2733110198753117, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.4781216648879403, "grad_norm": 0.23996629606975922, "learning_rate": 8.019444909613521e-07, "loss": -0.0, "num_tokens": 56047635.0, "reward": 0.76171875, "reward_std": 0.13952717185020447, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 346 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.687330240004959e-09, "advantages/std": 0.5960680246353149, "advantages/var": 0.3552970899926464, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.4823906083244398, "grad_norm": 0.2624983197756317, "learning_rate": 8.008778536840867e-07, "loss": 0.0, "num_tokens": 56211206.0, "reward": 0.59765625, "reward_std": 0.1726752519607544, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4913311004638672, "step": 347 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 7.643456328090667e-09, "advantages/std": 0.5483058094978333, "advantages/var": 0.3006392607290742, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.4866595517609391, "grad_norm": 0.23557605254214864, "learning_rate": 7.998090655377441e-07, "loss": 0.0, "num_tokens": 56390588.0, "reward": 0.578125, "reward_std": 0.14887069165706635, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49482619762420654, "step": 348 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.7897975545984724e-09, "advantages/std": 0.5227813720703125, "advantages/var": 0.2733003629837185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.4909284951974386, "grad_norm": 0.21543497640620152, "learning_rate": 7.987381341627116e-07, "loss": 0.0, "num_tokens": 56573406.0, "reward": 0.64453125, "reward_std": 0.12810038030147552, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 349 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878720895786924e-09, "advantages/std": 0.5726844668388367, "advantages/var": 0.3279674985584826, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.4951974386339382, "grad_norm": 0.26031703247446014, "learning_rate": 7.976650672146976e-07, "loss": 0.0, "num_tokens": 56734640.0, "reward": 0.65234375, "reward_std": 0.15991923213005066, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 350 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.545457467686302e-09, "advantages/std": 0.6402835249900818, "advantages/var": 0.4099629923737247, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.4994663820704375, "grad_norm": 0.26914821743698814, "learning_rate": 7.965898723646776e-07, "loss": 0.0, "num_tokens": 56899076.0, "reward": 0.671875, "reward_std": 0.20608046650886536, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 351 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.1939374485847554e-09, "advantages/std": 0.43738609552383423, "advantages/var": 0.19130659655758464, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.503735325506937, "grad_norm": 0.17746551797009705, "learning_rate": 7.955125572988381e-07, "loss": 0.0, "num_tokens": 57044649.0, "reward": 0.796875, "reward_std": 0.08588206768035889, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 352 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.694597197528499e-10, "advantages/std": 0.4959544539451599, "advantages/var": 0.24597082038804174, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.5080042689434365, "grad_norm": 0.2406487583945076, "learning_rate": 7.944331297185222e-07, "loss": -0.0, "num_tokens": 57190732.0, "reward": 0.66015625, "reward_std": 0.11534436792135239, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 353 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562954778661877e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.5122732123799358, "grad_norm": 0.260133206173505, "learning_rate": 7.933515973401754e-07, "loss": 0.0, "num_tokens": 57351877.0, "reward": 0.6953125, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 354 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.125665828442654e-09, "advantages/std": 0.5227988958358765, "advantages/var": 0.2733186854872116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.5165421558164356, "grad_norm": 0.25786833797545927, "learning_rate": 7.922679678952888e-07, "loss": -0.0, "num_tokens": 57501527.0, "reward": 0.7421875, "reward_std": 0.14689236879348755, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 355 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755686559635449e-09, "advantages/std": 0.4959532916545868, "advantages/var": 0.24596966750301963, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 1.520811099252935, "grad_norm": 0.23675947330341743, "learning_rate": 7.911822491303452e-07, "loss": -0.0, "num_tokens": 57632583.0, "reward": 0.8125, "reward_std": 0.1153419092297554, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.3910769522190094, "step": 356 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5228006839752197, "advantages/var": 0.27332055516495757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.5250800426894342, "grad_norm": 0.246557803675931, "learning_rate": 7.900944488067628e-07, "loss": -0.0, "num_tokens": 57791374.0, "reward": 0.63671875, "reward_std": 0.14966067671775818, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 357 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.00822312992985e-09, "advantages/std": 0.5227941870689392, "advantages/var": 0.273313762033073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 1.529348986125934, "grad_norm": 0.25317587277774867, "learning_rate": 7.890045747008405e-07, "loss": 0.0, "num_tokens": 57939727.0, "reward": 0.765625, "reward_std": 0.1417675018310547, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 358 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.489676036788974e-09, "advantages/std": 0.4675922393798828, "advantages/var": 0.21864250232829363, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.5336179295624333, "grad_norm": 0.22174911867779942, "learning_rate": 7.879126346037018e-07, "loss": -0.0, "num_tokens": 58074464.0, "reward": 0.78125, "reward_std": 0.10429581999778748, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 359 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.3876010625689843e-09, "advantages/std": 0.6185721755027771, "advantages/var": 0.3826315363062385, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.5378868729989328, "grad_norm": 0.33116736798190893, "learning_rate": 7.86818636321239e-07, "loss": 0.0, "num_tokens": 58247849.0, "reward": 0.66796875, "reward_std": 0.190556138753891, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 360 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2998135000829877e-09, "advantages/std": 0.4049556851387024, "advantages/var": 0.16398910692615587, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.5421558164354323, "grad_norm": 0.18075330496000655, "learning_rate": 7.857225876740583e-07, "loss": -0.0, "num_tokens": 58385344.0, "reward": 0.7734375, "reward_std": 0.0866745114326477, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 361 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.226817031586524e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.5464247598719316, "grad_norm": 0.2604824454603905, "learning_rate": 7.846244964974224e-07, "loss": 0.0, "num_tokens": 58548454.0, "reward": 0.765625, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 362 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.94480707872331e-09, "advantages/std": 0.5483153462409973, "advantages/var": 0.30064971892338477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.5506937033084311, "grad_norm": 0.3140737111444032, "learning_rate": 7.83524370641196e-07, "loss": 0.0, "num_tokens": 58705975.0, "reward": 0.7421875, "reward_std": 0.16018126904964447, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 363 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.34441632512063e-09, "advantages/std": 0.522782564163208, "advantages/var": 0.2733016093930587, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 1.5549626467449307, "grad_norm": 0.19446400216043178, "learning_rate": 7.824222179697884e-07, "loss": 0.0, "num_tokens": 58874863.0, "reward": 0.6484375, "reward_std": 0.12980784475803375, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 364 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.9724352049941504e-09, "advantages/std": 0.5483095049858093, "advantages/var": 0.30064331325778326, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.55923159018143, "grad_norm": 0.19758478072310584, "learning_rate": 7.813180463620985e-07, "loss": 0.0, "num_tokens": 59023297.0, "reward": 0.75, "reward_std": 0.15452352166175842, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 365 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.7640222169579895e-09, "advantages/std": 0.6185687184333801, "advantages/var": 0.3826272594243143, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 1.5635005336179295, "grad_norm": 0.28858573770761964, "learning_rate": 7.802118637114573e-07, "loss": -0.0, "num_tokens": 59207565.0, "reward": 0.58984375, "reward_std": 0.18649210035800934, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 366 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.7896985381826545e-09, "advantages/std": 0.5227903127670288, "advantages/var": 0.2733097111230478, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.567769477054429, "grad_norm": 0.30868263813123614, "learning_rate": 7.791036779255726e-07, "loss": 0.0, "num_tokens": 59343474.0, "reward": 0.7109375, "reward_std": 0.1361146867275238, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 367 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.4721687339672804e-09, "advantages/std": 0.5726834535598755, "advantages/var": 0.32796633798126607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.5720384204909283, "grad_norm": 0.22317988188778135, "learning_rate": 7.779934969264712e-07, "loss": -0.0, "num_tokens": 59505174.0, "reward": 0.734375, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 368 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 8.461828504089582e-09, "advantages/std": 0.5227926969528198, "advantages/var": 0.2733122039872029, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.576307363927428, "grad_norm": 0.26201326633264177, "learning_rate": 7.768813286504438e-07, "loss": 0.0, "num_tokens": 59669277.0, "reward": 0.6328125, "reward_std": 0.13952963054180145, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 369 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.1175139874944796e-09, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.5805763073639274, "grad_norm": 0.21229764390237946, "learning_rate": 7.757671810479864e-07, "loss": 0.0, "num_tokens": 59824332.0, "reward": 0.66015625, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 370 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2267555928306024e-09, "advantages/std": 0.5228024125099182, "advantages/var": 0.2733223625261907, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.584845250800427, "grad_norm": 0.21745259260943325, "learning_rate": 7.746510620837458e-07, "loss": -0.0, "num_tokens": 59987591.0, "reward": 0.73828125, "reward_std": 0.15030977129936218, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 371 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.07788625415326e-09, "advantages/std": 0.5960744619369507, "advantages/var": 0.35530476417342527, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.5891141942369265, "grad_norm": 0.265702997455518, "learning_rate": 7.735329797364605e-07, "loss": 0.0, "num_tokens": 60145919.0, "reward": 0.7265625, "reward_std": 0.1817454695701599, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 372 }, { "advantages/mean": -6.984919309616089e-09, "advantages/snr": 1.1291838156199155e-08, "advantages/std": 0.618581235408783, "advantages/var": 0.38264274479985616, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.5933831376734258, "grad_norm": 0.26508239510597087, "learning_rate": 7.724129419989043e-07, "loss": 0.0, "num_tokens": 60318193.0, "reward": 0.72265625, "reward_std": 0.2013387382030487, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 373 }, { "advantages/mean": -7.683411240577698e-09, "advantages/snr": 1.4012945382601492e-08, "advantages/std": 0.5483080744743347, "advantages/var": 0.3006417445337526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.5976520811099253, "grad_norm": 0.2546758923492174, "learning_rate": 7.712909568778301e-07, "loss": -0.0, "num_tokens": 60494194.0, "reward": 0.625, "reward_std": 0.152285635471344, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4850712716579437, "step": 374 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.898950551777039e-09, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.6019210245464248, "grad_norm": 0.22292132323131664, "learning_rate": 7.701670323939116e-07, "loss": -0.0, "num_tokens": 60656586.0, "reward": 0.67578125, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 375 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.1248730777263736e-09, "advantages/std": 0.5960706472396851, "advantages/var": 0.35530021650073706, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.6061899679829241, "grad_norm": 0.29603461478600207, "learning_rate": 7.690411765816864e-07, "loss": -0.0, "num_tokens": 60806490.0, "reward": 0.69921875, "reward_std": 0.17715102434158325, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 376 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.775109422012961e-09, "advantages/std": 0.6185806393623352, "advantages/var": 0.3826420073939154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.6104589114194237, "grad_norm": 0.32902896731621456, "learning_rate": 7.679133974894982e-07, "loss": 0.0, "num_tokens": 60963003.0, "reward": 0.69921875, "reward_std": 0.20186668634414673, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 377 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 8.280908722794443e-09, "advantages/std": 0.618564248085022, "advantages/var": 0.3826217290089886, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.6147278548559232, "grad_norm": 0.25689944731144304, "learning_rate": 7.667837031794403e-07, "loss": -0.0, "num_tokens": 61120387.0, "reward": 0.73828125, "reward_std": 0.18030640482902527, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 378 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.272710617665313e-09, "advantages/std": 0.6402875185012817, "advantages/var": 0.4099681063485292, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.6189967982924225, "grad_norm": 0.2899400646000739, "learning_rate": 7.656521017272963e-07, "loss": 0.0, "num_tokens": 61304243.0, "reward": 0.51953125, "reward_std": 0.21003073453903198, "rewards/drgrpo_math_reward/mean": 0.51953125, "rewards/drgrpo_math_reward/std": 0.5005971193313599, "step": 379 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.492624931018031e-10, "advantages/std": 0.5483125448226929, "advantages/var": 0.3006466468099376, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.6232657417289222, "grad_norm": 0.22626008132108802, "learning_rate": 7.645186012224838e-07, "loss": 0.0, "num_tokens": 61448061.0, "reward": 0.796875, "reward_std": 0.15741050243377686, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 380 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 5.17467405816827e-09, "advantages/std": 0.404948353767395, "advantages/var": 0.1639831692189233, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.6275346851654215, "grad_norm": 0.19854078916882764, "learning_rate": 7.633832097679957e-07, "loss": -0.0, "num_tokens": 61598324.0, "reward": 0.71484375, "reward_std": 0.07995839416980743, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 381 }, { "advantages/mean": 5.3551048040390015e-09, "advantages/snr": 1.0243059377680273e-08, "advantages/std": 0.5228032469749451, "advantages/var": 0.2733232350475454, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.631803628601921, "grad_norm": 0.20930490430516044, "learning_rate": 7.622459354803434e-07, "loss": -0.0, "num_tokens": 61758043.0, "reward": 0.67578125, "reward_std": 0.1514868289232254, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 382 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.090890540632927e-09, "advantages/std": 0.640295147895813, "advantages/var": 0.40997787641892103, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.6360725720384206, "grad_norm": 0.2816231565987675, "learning_rate": 7.611067864894971e-07, "loss": -0.0, "num_tokens": 61915425.0, "reward": 0.62109375, "reward_std": 0.2202804535627365, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 383 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907327028325072e-10, "advantages/std": 0.5227845311164856, "advantages/var": 0.2733036659746837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.64034151547492, "grad_norm": 0.233848130807371, "learning_rate": 7.59965770938829e-07, "loss": -0.0, "num_tokens": 62058323.0, "reward": 0.78515625, "reward_std": 0.13098736107349396, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 384 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.794348108931506e-09, "advantages/std": 0.5482925176620483, "advantages/var": 0.3006246849241876, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.6446104589114194, "grad_norm": 0.2823718727784672, "learning_rate": 7.588228969850548e-07, "loss": -0.0, "num_tokens": 62212329.0, "reward": 0.7265625, "reward_std": 0.1352011114358902, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 385 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.065573242237083e-10, "advantages/std": 0.5726883411407471, "advantages/var": 0.3279719360785407, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 1.648879402347919, "grad_norm": 0.28800478049676365, "learning_rate": 7.576781727981749e-07, "loss": 0.0, "num_tokens": 62382068.0, "reward": 0.6171875, "reward_std": 0.16610248386859894, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 386 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.4646680582978397e-09, "advantages/std": 0.6612713932991028, "advantages/var": 0.4372798555957367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.6531483457844183, "grad_norm": 0.27720319557160134, "learning_rate": 7.565316065614167e-07, "loss": 0.0, "num_tokens": 62530940.0, "reward": 0.74609375, "reward_std": 0.20464137196540833, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 387 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.9724914293101917e-09, "advantages/std": 0.5482991337776184, "advantages/var": 0.3006319401012867, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.6574172892209178, "grad_norm": 0.22444635976189184, "learning_rate": 7.553832064711756e-07, "loss": -0.0, "num_tokens": 62687916.0, "reward": 0.7421875, "reward_std": 0.14203590154647827, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 388 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.7753000537295894e-09, "advantages/std": 0.6185632348060608, "advantages/var": 0.3826204754537379, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.6616862326574173, "grad_norm": 0.2803792143949797, "learning_rate": 7.542329807369565e-07, "loss": 0.0, "num_tokens": 62831060.0, "reward": 0.8203125, "reward_std": 0.17859894037246704, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 389 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5969403245552258e-09, "advantages/std": 0.4373938739299774, "advantages/var": 0.19131340095147298, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.6659551760939166, "grad_norm": 0.18879394263981103, "learning_rate": 7.530809375813155e-07, "loss": -0.0, "num_tokens": 62993130.0, "reward": 0.72265625, "reward_std": 0.09377524256706238, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 390 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.4720812289421775e-09, "advantages/std": 0.5726946592330933, "advantages/var": 0.3279791727141088, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.6702241195304164, "grad_norm": 0.24513909553197558, "learning_rate": 7.519270852398001e-07, "loss": -0.0, "num_tokens": 63157176.0, "reward": 0.7109375, "reward_std": 0.17123225331306458, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 391 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.980860316001973e-09, "advantages/std": 0.49595165252685547, "advantages/var": 0.24596804164411878, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.6744930629669157, "grad_norm": 0.24584737442182014, "learning_rate": 7.507714319608921e-07, "loss": 0.0, "num_tokens": 63306499.0, "reward": 0.6953125, "reward_std": 0.11310401558876038, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 392 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516781396180255e-09, "advantages/std": 0.6185749173164368, "advantages/var": 0.3826349283330366, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.678762006403415, "grad_norm": 0.33599920290529195, "learning_rate": 7.496139860059467e-07, "loss": 0.0, "num_tokens": 63483796.0, "reward": 0.546875, "reward_std": 0.1938573122024536, "rewards/drgrpo_math_reward/mean": 0.546875, "rewards/drgrpo_math_reward/std": 0.4987730085849762, "step": 393 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.9449311575715845e-09, "advantages/std": 0.5483039021492004, "advantages/var": 0.30063716911203997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.6830309498399147, "grad_norm": 0.23643434716983186, "learning_rate": 7.484547556491345e-07, "loss": 0.0, "num_tokens": 63637886.0, "reward": 0.73046875, "reward_std": 0.14769117534160614, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 394 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.3694770070372925e-09, "advantages/std": 0.5483118295669556, "advantages/var": 0.30064586244306213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.687299893276414, "grad_norm": 0.263495448332587, "learning_rate": 7.472937491773823e-07, "loss": 0.0, "num_tokens": 63799240.0, "reward": 0.64453125, "reward_std": 0.15835265815258026, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 395 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.13136821560598e-09, "advantages/std": 0.5726727247238159, "advantages/var": 0.32795404964259944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.6915688367129136, "grad_norm": 0.24958196804145655, "learning_rate": 7.461309748903137e-07, "loss": -0.0, "num_tokens": 63958086.0, "reward": 0.71875, "reward_std": 0.1462520956993103, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 396 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.672133215912294e-09, "advantages/std": 0.5227972269058228, "advantages/var": 0.2733169404604183, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.695837780149413, "grad_norm": 0.2619349062534, "learning_rate": 7.449664411001897e-07, "loss": -0.0, "num_tokens": 64114337.0, "reward": 0.6484375, "reward_std": 0.14624328911304474, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 397 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.816697216482564e-09, "advantages/std": 0.49596521258354187, "advantages/var": 0.24598149209303788, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.7001067235859124, "grad_norm": 0.25131872608530903, "learning_rate": 7.438001561318494e-07, "loss": -0.0, "num_tokens": 64269432.0, "reward": 0.69921875, "reward_std": 0.12677115201950073, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 398 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.098235462887327e-09, "advantages/std": 0.572700023651123, "advantages/var": 0.3279853170899969, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.704375667022412, "grad_norm": 0.24340881384990734, "learning_rate": 7.426321283226503e-07, "loss": -0.0, "num_tokens": 64426570.0, "reward": 0.62890625, "reward_std": 0.1797696202993393, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 399 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4393045940057892e-09, "advantages/std": 0.5726975798606873, "advantages/var": 0.32798251797828826, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.7086446104589115, "grad_norm": 0.26003316162281526, "learning_rate": 7.414623660224093e-07, "loss": 0.0, "num_tokens": 64586322.0, "reward": 0.7109375, "reward_std": 0.17582425475120544, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 400 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4937602088770592e-09, "advantages/std": 0.46760645508766174, "advantages/var": 0.21865579683964942, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.7129135538954108, "grad_norm": 0.21370170671538638, "learning_rate": 7.402908775933419e-07, "loss": 0.0, "num_tokens": 64735034.0, "reward": 0.63671875, "reward_std": 0.11955174803733826, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 401 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.658925283782654e-09, "advantages/std": 0.5727025270462036, "advantages/var": 0.3279881844851076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.7171824973319103, "grad_norm": 0.29177020368304496, "learning_rate": 7.391176714100037e-07, "loss": 0.0, "num_tokens": 64874775.0, "reward": 0.7421875, "reward_std": 0.18371497094631195, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 402 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.1491582420370604e-09, "advantages/std": 0.3696712255477905, "advantages/var": 0.13665681499800542, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.7214514407684098, "grad_norm": 0.16600266818960302, "learning_rate": 7.379427558592295e-07, "loss": -0.0, "num_tokens": 65039992.0, "reward": 0.51171875, "reward_std": 0.07167815417051315, "rewards/drgrpo_math_reward/mean": 0.51171875, "rewards/drgrpo_math_reward/std": 0.5008418560028076, "step": 403 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691739153544274e-09, "advantages/std": 0.572694718837738, "advantages/var": 0.3279792409846358, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.7257203842049091, "grad_norm": 0.2684923274817556, "learning_rate": 7.36766139340074e-07, "loss": -0.0, "num_tokens": 65185373.0, "reward": 0.7421875, "reward_std": 0.172937273979187, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 404 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2739001097361383e-09, "advantages/std": 0.5483098030090332, "advantages/var": 0.3006436400758048, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.729989327641409, "grad_norm": 0.22906645703144418, "learning_rate": 7.355878302637514e-07, "loss": -0.0, "num_tokens": 65340303.0, "reward": 0.75390625, "reward_std": 0.15505394339561462, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 405 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814463133777788e-09, "advantages/std": 0.5227901339530945, "advantages/var": 0.2733095241586945, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.7342582710779082, "grad_norm": 0.23426357271085818, "learning_rate": 7.344078370535755e-07, "loss": -0.0, "num_tokens": 65494084.0, "reward": 0.6640625, "reward_std": 0.13770347833633423, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 406 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1291963283089938e-09, "advantages/std": 0.6185743808746338, "advantages/var": 0.3826342646744365, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.7385272145144077, "grad_norm": 0.3011024733459177, "learning_rate": 7.332261681448995e-07, "loss": 0.0, "num_tokens": 65648246.0, "reward": 0.7421875, "reward_std": 0.1927964836359024, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 407 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.95304215824588e-09, "advantages/std": 0.596071720123291, "advantages/var": 0.355301495530739, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.7427961579509073, "grad_norm": 0.2925011236710639, "learning_rate": 7.320428319850549e-07, "loss": 0.0, "num_tokens": 65828462.0, "reward": 0.6015625, "reward_std": 0.17885848879814148, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4905354380607605, "step": 408 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.453650819722753e-10, "advantages/std": 0.522786021232605, "advantages/var": 0.2733052239962177, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.7470651013874066, "grad_norm": 0.23509345702783163, "learning_rate": 7.308578370332925e-07, "loss": -0.0, "num_tokens": 65987757.0, "reward": 0.77734375, "reward_std": 0.1332252472639084, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 409 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.323123171578266e-10, "advantages/std": 0.43739479780197144, "advantages/var": 0.19131420914422748, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 1.751334044823906, "grad_norm": 0.18315104597430326, "learning_rate": 7.29671191760721e-07, "loss": 0.0, "num_tokens": 66130350.0, "reward": 0.69921875, "reward_std": 0.09324727952480316, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 410 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 7.904269953437837e-09, "advantages/std": 0.618582546710968, "advantages/var": 0.38264436709542693, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.7556029882604056, "grad_norm": 0.3246528040350126, "learning_rate": 7.284829046502467e-07, "loss": 0.0, "num_tokens": 66288313.0, "reward": 0.73046875, "reward_std": 0.20357662439346313, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 411 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.2462903089234764e-10, "advantages/std": 0.5483154058456421, "advantages/var": 0.3006497842876712, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.759871931696905, "grad_norm": 0.3002740331443812, "learning_rate": 7.272929841965126e-07, "loss": 0.0, "num_tokens": 66438066.0, "reward": 0.5625, "reward_std": 0.1618862748146057, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49705013632774353, "step": 412 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.75557733228642e-09, "advantages/std": 0.49596771597862244, "advantages/var": 0.2459839752930515, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 1.7641408751334045, "grad_norm": 0.22961791009181387, "learning_rate": 7.261014389058382e-07, "loss": 0.0, "num_tokens": 66586048.0, "reward": 0.73828125, "reward_std": 0.13018609583377838, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 413 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 7.421606213341186e-09, "advantages/std": 0.5960680246353149, "advantages/var": 0.3552970899926464, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.768409818569904, "grad_norm": 0.29006183656939377, "learning_rate": 7.249082772961582e-07, "loss": 0.0, "num_tokens": 66734309.0, "reward": 0.73828125, "reward_std": 0.1726752519607544, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 414 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.4535812554776114e-10, "advantages/std": 0.5227941870689392, "advantages/var": 0.273313762033073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.7726787620064033, "grad_norm": 0.24788218742175322, "learning_rate": 7.237135078969618e-07, "loss": -0.0, "num_tokens": 66888872.0, "reward": 0.7265625, "reward_std": 0.1417675018310547, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 415 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.7343076901567512e-09, "advantages/std": 0.596061110496521, "advantages/var": 0.3552888474463458, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.776947705442903, "grad_norm": 0.24947353174478817, "learning_rate": 7.225171392492315e-07, "loss": -0.0, "num_tokens": 67070163.0, "reward": 0.49609375, "reward_std": 0.1646634042263031, "rewards/drgrpo_math_reward/mean": 0.49609375, "rewards/drgrpo_math_reward/std": 0.5009641647338867, "step": 416 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7814572812374478e-09, "advantages/std": 0.5227869153022766, "advantages/var": 0.27330615881126974, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 1.7812166488794023, "grad_norm": 0.24503746256418613, "learning_rate": 7.21319179905383e-07, "loss": -0.0, "num_tokens": 67227683.0, "reward": 0.69921875, "reward_std": 0.1344023048877716, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 417 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.818178206786404e-09, "advantages/std": 0.6402846574783325, "advantages/var": 0.4099644426021456, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.7854855923159016, "grad_norm": 0.27811569628160054, "learning_rate": 7.201196384292026e-07, "loss": 0.0, "num_tokens": 67384174.0, "reward": 0.6484375, "reward_std": 0.20661331713199615, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 418 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.38902400835478e-10, "advantages/std": 0.495963454246521, "advantages/var": 0.24597974794814093, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.7897545357524014, "grad_norm": 0.21451958553347664, "learning_rate": 7.189185233957867e-07, "loss": 0.0, "num_tokens": 67546554.0, "reward": 0.69140625, "reward_std": 0.12612205743789673, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 419 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.2861694609705216e-09, "advantages/std": 0.4959617853164673, "advantages/var": 0.24597809249429758, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.7940234791889007, "grad_norm": 0.23429005770960726, "learning_rate": 7.17715843391481e-07, "loss": 0.0, "num_tokens": 67697689.0, "reward": 0.71484375, "reward_std": 0.12217916548252106, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 420 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.548311710357666, "advantages/var": 0.30064573171534903, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.7982924226254002, "grad_norm": 0.2657669321526178, "learning_rate": 7.165116070138182e-07, "loss": -0.0, "num_tokens": 67847963.0, "reward": 0.703125, "reward_std": 0.15623344480991364, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 421 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.234897888925693e-09, "advantages/std": 0.5228039026260376, "advantages/var": 0.2733239206010154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.8025613660618998, "grad_norm": 0.2325526086698693, "learning_rate": 7.153058228714573e-07, "loss": -0.0, "num_tokens": 67999484.0, "reward": 0.77734375, "reward_std": 0.1525476574897766, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 422 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 4.523380573164966e-09, "advantages/std": 0.7206178903579712, "advantages/var": 0.519290143903973, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.806830309498399, "grad_norm": 0.29965403183055045, "learning_rate": 7.140984995841213e-07, "loss": 0.0, "num_tokens": 68169225.0, "reward": 0.70703125, "reward_std": 0.26329755783081055, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 423 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4675910174846649, "advantages/var": 0.2186413596323442, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.8110992529348986, "grad_norm": 0.22870163723664103, "learning_rate": 7.128896457825363e-07, "loss": 0.0, "num_tokens": 68295369.0, "reward": 0.84765625, "reward_std": 0.10429336875677109, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.3600577116012573, "step": 424 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.75558162001709e-09, "advantages/std": 0.49596714973449707, "advantages/var": 0.24598341361576104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.8153681963713981, "grad_norm": 0.21145463928490568, "learning_rate": 7.116792701083696e-07, "loss": 0.0, "num_tokens": 68449788.0, "reward": 0.78125, "reward_std": 0.12953945994377136, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 425 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.090965171223325e-09, "advantages/std": 0.6402772068977356, "advantages/var": 0.4099549016727657, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.8196371398078974, "grad_norm": 0.3193763108454424, "learning_rate": 7.104673812141675e-07, "loss": -0.0, "num_tokens": 68611334.0, "reward": 0.6875, "reward_std": 0.19647981226444244, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 426 }, { "advantages/mean": -6.28642737865448e-09, "advantages/snr": 9.506647061701755e-09, "advantages/std": 0.6612665057182312, "advantages/var": 0.4372733915847995, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.823906083244397, "grad_norm": 0.30199023604753306, "learning_rate": 7.092539877632939e-07, "loss": 0.0, "num_tokens": 68777395.0, "reward": 0.75390625, "reward_std": 0.19727861881256104, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 427 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2196798431161686e-09, "advantages/std": 0.572684645652771, "advantages/var": 0.3279677033664399, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.8281750266808965, "grad_norm": 0.2840608093900212, "learning_rate": 7.080390984298686e-07, "loss": 0.0, "num_tokens": 68935328.0, "reward": 0.6171875, "reward_std": 0.16203844547271729, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 428 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.1125556422290086e-09, "advantages/std": 0.3306383490562439, "advantages/var": 0.10932171786663858, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.8324439701173958, "grad_norm": 0.14263664676729204, "learning_rate": 7.068227218987042e-07, "loss": -0.0, "num_tokens": 69075377.0, "reward": 0.84765625, "reward_std": 0.05273643508553505, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.3600577116012573, "step": 429 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.5478365019499404e-09, "advantages/std": 0.5483019948005676, "advantages/var": 0.3006350775022817, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.8367129135538955, "grad_norm": 0.25853283598907206, "learning_rate": 7.056048668652454e-07, "loss": 0.0, "num_tokens": 69226123.0, "reward": 0.796875, "reward_std": 0.14651167392730713, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 430 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.694585349261446e-09, "advantages/std": 0.4959557056427002, "advantages/var": 0.24597206195954868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 1.8409818569903948, "grad_norm": 0.2377317602126353, "learning_rate": 7.04385542035506e-07, "loss": 0.0, "num_tokens": 69377157.0, "reward": 0.734375, "reward_std": 0.11705183982849121, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 431 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.511243128167531e-09, "advantages/std": 0.49596187472343445, "advantages/var": 0.24597818117918369, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 1.8452508004268944, "grad_norm": 0.26899356838560157, "learning_rate": 7.031647561260065e-07, "loss": -0.0, "num_tokens": 69547761.0, "reward": 0.51171875, "reward_std": 0.1238841786980629, "rewards/drgrpo_math_reward/mean": 0.51171875, "rewards/drgrpo_math_reward/std": 0.5008418560028076, "step": 432 }, { "advantages/mean": -5.3551048040390015e-09, "advantages/snr": 1.0243288273196778e-08, "advantages/std": 0.5227915644645691, "advantages/var": 0.2733110198753117, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.849519743863394, "grad_norm": 0.2671321396595432, "learning_rate": 7.019425178637126e-07, "loss": -0.0, "num_tokens": 69704764.0, "reward": 0.72265625, "reward_std": 0.13952717185020447, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 433 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 1.220555812162176e-08, "advantages/std": 0.4959704875946045, "advantages/var": 0.24598672456482973, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.8537886872998932, "grad_norm": 0.22714143912809598, "learning_rate": 7.007188359859726e-07, "loss": -0.0, "num_tokens": 69871863.0, "reward": 0.5703125, "reward_std": 0.1324264407157898, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4960011839866638, "step": 434 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.572358668578157e-09, "advantages/std": 0.4959602952003479, "advantages/var": 0.24597661441521623, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.8580576307363927, "grad_norm": 0.21561949231343305, "learning_rate": 6.994937192404537e-07, "loss": -0.0, "num_tokens": 70037208.0, "reward": 0.61328125, "reward_std": 0.12164628505706787, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 435 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.1030515379783765e-09, "advantages/std": 0.49594834446907043, "advantages/var": 0.24596476038161175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 1.8623265741728923, "grad_norm": 0.2307172453911008, "learning_rate": 6.982671763850814e-07, "loss": 0.0, "num_tokens": 70169422.0, "reward": 0.796875, "reward_std": 0.11021704226732254, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 436 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.477184298733457e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 1.8665955176093916, "grad_norm": 0.2325116554205341, "learning_rate": 6.970392161879755e-07, "loss": 0.0, "num_tokens": 70307647.0, "reward": 0.7890625, "reward_std": 0.11230767518281937, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 437 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.1498931253727876e-09, "advantages/std": 0.4049604833126068, "advantages/var": 0.1639929930447801, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.870864461045891, "grad_norm": 0.19222639805887112, "learning_rate": 6.95809847427388e-07, "loss": -0.0, "num_tokens": 70451518.0, "reward": 0.74609375, "reward_std": 0.09020812809467316, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 438 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 9.409914999290637e-09, "advantages/std": 0.6185779571533203, "advantages/var": 0.382638689075975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.8751334044823906, "grad_norm": 0.33670992741302663, "learning_rate": 6.945790788916401e-07, "loss": 0.0, "num_tokens": 70590710.0, "reward": 0.80078125, "reward_std": 0.1989797204732895, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 439 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.979278129208303e-10, "advantages/std": 0.46759918332099915, "advantages/var": 0.21864899624246537, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 1.87940234791889, "grad_norm": 0.22493791368848148, "learning_rate": 6.933469193790599e-07, "loss": 0.0, "num_tokens": 70742283.0, "reward": 0.75, "reward_std": 0.11165857315063477, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 440 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 7.452157657712755e-09, "advantages/std": 0.43740740418434143, "advantages/var": 0.19132523723528383, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.8836712913553897, "grad_norm": 0.2144476365775843, "learning_rate": 6.921133776979186e-07, "loss": -0.0, "num_tokens": 70905201.0, "reward": 0.640625, "reward_std": 0.10520448535680771, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 441 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.43740326166152954, "advantages/var": 0.19132161331214448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.887940234791889, "grad_norm": 0.23029173426421345, "learning_rate": 6.908784626663681e-07, "loss": -0.0, "num_tokens": 71047781.0, "reward": 0.66796875, "reward_std": 0.10178709030151367, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 442 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.389170698522892e-10, "advantages/std": 0.4959557056427002, "advantages/var": 0.24597206195954868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.8922091782283885, "grad_norm": 0.3264070723484753, "learning_rate": 6.896421831123782e-07, "loss": -0.0, "num_tokens": 71204495.0, "reward": 0.703125, "reward_std": 0.11705183982849121, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 443 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4083860990095324e-09, "advantages/std": 0.4959520101547241, "advantages/var": 0.24596839637651158, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.896478121664888, "grad_norm": 0.23071652035781837, "learning_rate": 6.884045478736731e-07, "loss": -0.0, "num_tokens": 71349483.0, "reward": 0.70703125, "reward_std": 0.11363443732261658, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 444 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.129211343901943e-09, "advantages/std": 0.6185661554336548, "advantages/var": 0.38262408864797237, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.9007470651013874, "grad_norm": 0.26391827209425767, "learning_rate": 6.871655657976681e-07, "loss": -0.0, "num_tokens": 71532006.0, "reward": 0.61328125, "reward_std": 0.18201632797718048, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 445 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.3436707446085615e-09, "advantages/std": 0.5960665941238403, "advantages/var": 0.355295384630395, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.9050160085378869, "grad_norm": 0.2605638541707349, "learning_rate": 6.859252457414066e-07, "loss": 0.0, "num_tokens": 71686355.0, "reward": 0.72265625, "reward_std": 0.1720261573791504, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 446 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 4.8931765453848975e-09, "advantages/std": 0.6185753345489502, "advantages/var": 0.38263544451234566, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.9092849519743864, "grad_norm": 0.26190728557911336, "learning_rate": 6.84683596571497e-07, "loss": 0.0, "num_tokens": 71852703.0, "reward": 0.65234375, "reward_std": 0.19450394809246063, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 447 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6985172313936096e-09, "advantages/std": 0.5483150482177734, "advantages/var": 0.3006493921020592, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.9135538954108857, "grad_norm": 0.23280151099368152, "learning_rate": 6.834406271640487e-07, "loss": -0.0, "num_tokens": 72011642.0, "reward": 0.66796875, "reward_std": 0.15965083241462708, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 448 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.779101397944587e-09, "advantages/std": 0.36966031789779663, "advantages/var": 0.13664875062830006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 1.9178228388473852, "grad_norm": 0.17630788878023093, "learning_rate": 6.821963464046095e-07, "loss": 0.0, "num_tokens": 72147983.0, "reward": 0.69921875, "reward_std": 0.062077511101961136, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 449 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.449885533156584e-09, "advantages/std": 0.404936283826828, "advantages/var": 0.1639733939594814, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.9220917822838848, "grad_norm": 0.18605668705259262, "learning_rate": 6.809507631881013e-07, "loss": -0.0, "num_tokens": 72302131.0, "reward": 0.67578125, "reward_std": 0.06970866024494171, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 450 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.318148370494186e-09, "advantages/std": 0.5726792216300964, "advantages/var": 0.3279614908868531, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.926360725720384, "grad_norm": 0.29718149010370043, "learning_rate": 6.797038864187563e-07, "loss": 0.0, "num_tokens": 72444448.0, "reward": 0.734375, "reward_std": 0.154791921377182, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 451 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.8181833977205966e-09, "advantages/std": 0.6402837038040161, "advantages/var": 0.40996322135698904, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.9306296691568838, "grad_norm": 0.30249912557639164, "learning_rate": 6.78455725010055e-07, "loss": 0.0, "num_tokens": 72609549.0, "reward": 0.66015625, "reward_std": 0.20490585267543793, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 452 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.7342839024723756e-09, "advantages/std": 0.5960662961006165, "advantages/var": 0.35529502934710777, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.9348986125933831, "grad_norm": 0.2547498641016505, "learning_rate": 6.772062878846603e-07, "loss": 0.0, "num_tokens": 72767868.0, "reward": 0.65625, "reward_std": 0.1714957356452942, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 453 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 6.29830531336252e-09, "advantages/std": 0.36967188119888306, "advantages/var": 0.1366572997491211, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 1.9391675560298824, "grad_norm": 0.17803173936691005, "learning_rate": 6.759555839743549e-07, "loss": 0.0, "num_tokens": 72917624.0, "reward": 0.6640625, "reward_std": 0.07061977684497833, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 454 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907301639445506e-10, "advantages/std": 0.522786021232605, "advantages/var": 0.2733052239962177, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.9434364994663822, "grad_norm": 0.23256361730744385, "learning_rate": 6.747036222199783e-07, "loss": -0.0, "num_tokens": 73077423.0, "reward": 0.65234375, "reward_std": 0.1332252323627472, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 455 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.1640342645653495e-09, "advantages/std": 0.4959566295146942, "advantages/var": 0.24597297835957566, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.9477054429028815, "grad_norm": 0.23992973512319926, "learning_rate": 6.734504115713602e-07, "loss": -0.0, "num_tokens": 73237735.0, "reward": 0.71875, "reward_std": 0.11822889000177383, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 456 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.163920693670164e-09, "advantages/std": 0.4959675371646881, "advantages/var": 0.24598379792120628, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.951974386339381, "grad_norm": 0.24022370638212284, "learning_rate": 6.721959609872598e-07, "loss": 0.0, "num_tokens": 73394434.0, "reward": 0.75390625, "reward_std": 0.1267760694026947, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 457 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5056151252025905e-09, "advantages/std": 0.6185661554336548, "advantages/var": 0.38262408864797237, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.9562433297758806, "grad_norm": 0.2696626607781293, "learning_rate": 6.709402794352992e-07, "loss": 0.0, "num_tokens": 73568998.0, "reward": 0.66015625, "reward_std": 0.18201632797718048, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 458 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.0327421924445997e-09, "advantages/std": 0.5727008581161499, "advantages/var": 0.32798627288697446, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.9605122732123799, "grad_norm": 0.2694609947093644, "learning_rate": 6.696833758919005e-07, "loss": -0.0, "num_tokens": 73714000.0, "reward": 0.75390625, "reward_std": 0.18094666302204132, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 459 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.3361051476250139e-09, "advantages/std": 0.5227821469306946, "advantages/var": 0.27330117314946634, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.9647812166488794, "grad_norm": 0.2129444098803062, "learning_rate": 6.684252593422213e-07, "loss": -0.0, "num_tokens": 73875813.0, "reward": 0.68359375, "reward_std": 0.1275724172592163, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 460 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.129194478589663e-09, "advantages/std": 0.618575394153595, "advantages/var": 0.3826355182522754, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.969050160085379, "grad_norm": 0.26853815528519137, "learning_rate": 6.671659387800908e-07, "loss": 0.0, "num_tokens": 74037738.0, "reward": 0.60546875, "reward_std": 0.19450394809246063, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 461 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.633534917323103e-09, "advantages/std": 0.495952844619751, "advantages/var": 0.24596922408642286, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.9733191035218782, "grad_norm": 0.2702401086641601, "learning_rate": 6.659054232079452e-07, "loss": 0.0, "num_tokens": 74175028.0, "reward": 0.79296875, "reward_std": 0.11310647428035736, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 462 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.000009060457362e-09, "advantages/std": 0.6402828097343445, "advantages/var": 0.4099620764413068, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 1.9775880469583778, "grad_norm": 0.3298380348570836, "learning_rate": 6.646437216367633e-07, "loss": 0.0, "num_tokens": 74337345.0, "reward": 0.578125, "reward_std": 0.20490339398384094, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49482619762420654, "step": 463 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.898997469490353e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.9818569903948773, "grad_norm": 0.22513828791113058, "learning_rate": 6.633808430860019e-07, "loss": 0.0, "num_tokens": 74485423.0, "reward": 0.671875, "reward_std": 0.1344047486782074, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 464 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.9724516843147123e-09, "advantages/std": 0.5483064651489258, "advantages/var": 0.30063997972411016, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.9861259338313766, "grad_norm": 0.25301570427584996, "learning_rate": 6.621167965835322e-07, "loss": -0.0, "num_tokens": 74634718.0, "reward": 0.609375, "reward_std": 0.15163654088974, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 465 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.3472887252249853e-09, "advantages/std": 0.49595654010772705, "advantages/var": 0.24597288967562747, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.9903948772678763, "grad_norm": 0.2737895419841632, "learning_rate": 6.608515911655743e-07, "loss": 0.0, "num_tokens": 74784603.0, "reward": 0.703125, "reward_std": 0.1165238693356514, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 466 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.45366097526879e-10, "advantages/std": 0.5227848291397095, "advantages/var": 0.2733039775786352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 1.9946638207043756, "grad_norm": 0.2609088064315718, "learning_rate": 6.595852358766333e-07, "loss": -0.0, "num_tokens": 74927831.0, "reward": 0.6953125, "reward_std": 0.13151778280735016, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 467 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.131246346616979e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.9989327641408752, "grad_norm": 0.29652117636771275, "learning_rate": 6.583177397694337e-07, "loss": 0.0, "num_tokens": 75075603.0, "reward": 0.8046875, "reward_std": 0.15650182962417603, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 468 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.218888138077328e-09, "advantages/std": 0.5483006238937378, "advantages/var": 0.3006335741622621, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 2.0042689434364993, "grad_norm": 0.3051475077518512, "learning_rate": 6.570491119048558e-07, "loss": 0.0, "num_tokens": 75201411.0, "reward": 0.78125, "reward_std": 0.1442737877368927, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 469 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439363748430499e-09, "advantages/std": 0.5726836919784546, "advantages/var": 0.32796661105807345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 2.008537886872999, "grad_norm": 0.2751378768343912, "learning_rate": 6.557793613518703e-07, "loss": 0.0, "num_tokens": 75352067.0, "reward": 0.58984375, "reward_std": 0.15874217450618744, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 470 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721843985338186e-09, "advantages/std": 0.5227872133255005, "advantages/var": 0.27330647041664236, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.0128068303094984, "grad_norm": 0.2549435165894436, "learning_rate": 6.545084971874736e-07, "loss": -0.0, "num_tokens": 75507484.0, "reward": 0.7265625, "reward_std": 0.1349327266216278, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 471 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.1248571416022253e-09, "advantages/std": 0.5960736870765686, "advantages/var": 0.35530384042505503, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.0170757737459977, "grad_norm": 0.29969731602875654, "learning_rate": 6.532365284966232e-07, "loss": 0.0, "num_tokens": 75660148.0, "reward": 0.734375, "reward_std": 0.18056842684745789, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 472 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 1.0615912720815922e-08, "advantages/std": 0.5483057498931885, "advantages/var": 0.30063919536593176, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.0213447171824974, "grad_norm": 0.29207596259062774, "learning_rate": 6.51963464372172e-07, "loss": 0.0, "num_tokens": 75805634.0, "reward": 0.6796875, "reward_std": 0.14887069165706635, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 473 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.687273994754044e-09, "advantages/std": 0.596075177192688, "advantages/var": 0.3553056168652944, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 2.0256136606189967, "grad_norm": 0.25542200864749953, "learning_rate": 6.50689313914804e-07, "loss": 0.0, "num_tokens": 75962422.0, "reward": 0.734375, "reward_std": 0.1812175214290619, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 474 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.570994047501006e-09, "advantages/std": 0.5228006839752197, "advantages/var": 0.27332055516495757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.0298826040554965, "grad_norm": 0.33695170726332435, "learning_rate": 6.494140862329687e-07, "loss": 0.0, "num_tokens": 76126420.0, "reward": 0.59765625, "reward_std": 0.14966067671775818, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4913311004638672, "step": 475 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4374101758003235, "advantages/var": 0.1913276618936699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.034151547491996, "grad_norm": 0.17944291081012842, "learning_rate": 6.48137790442817e-07, "loss": -0.0, "num_tokens": 76278734.0, "reward": 0.6640625, "reward_std": 0.10691440105438232, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 476 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.323050633327592e-10, "advantages/std": 0.437400758266449, "advantages/var": 0.19131942333206453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.038420490928495, "grad_norm": 0.2104308698623457, "learning_rate": 6.468604356681347e-07, "loss": 0.0, "num_tokens": 76419959.0, "reward": 0.7734375, "reward_std": 0.09890255331993103, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 477 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.49597543478012085, "advantages/var": 0.2459916319053299, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.042689434364995, "grad_norm": 0.21642043572235753, "learning_rate": 6.45582031040278e-07, "loss": -0.0, "num_tokens": 76584561.0, "reward": 0.703125, "reward_std": 0.13755130767822266, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 478 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.1718066640118346e-09, "advantages/std": 0.5960811972618103, "advantages/var": 0.3553127937290732, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.046958377801494, "grad_norm": 0.3204108965931746, "learning_rate": 6.443025856981084e-07, "loss": -0.0, "num_tokens": 76727032.0, "reward": 0.7890625, "reward_std": 0.18964111804962158, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 479 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9875465035951096e-09, "advantages/std": 0.46760237216949463, "advantages/var": 0.21865197845853857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.0512273212379935, "grad_norm": 0.23652859766283577, "learning_rate": 6.430221087879271e-07, "loss": -0.0, "num_tokens": 76866220.0, "reward": 0.74609375, "reward_std": 0.114015132188797, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 480 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9916905604763657e-09, "advantages/std": 0.46760404109954834, "advantages/var": 0.2186535392526281, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.055496264674493, "grad_norm": 0.24549528815198077, "learning_rate": 6.417406094634089e-07, "loss": -0.0, "num_tokens": 77014076.0, "reward": 0.64453125, "reward_std": 0.11625301837921143, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 481 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.527941429292418e-09, "advantages/std": 0.6185771822929382, "advantages/var": 0.38263773045347094, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.0597652081109925, "grad_norm": 0.29132209347309834, "learning_rate": 6.404580968855384e-07, "loss": -0.0, "num_tokens": 77182637.0, "reward": 0.69921875, "reward_std": 0.19780266284942627, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 482 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.286193157034731e-09, "advantages/std": 0.49595820903778076, "advantages/var": 0.24597454511196304, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.064034151547492, "grad_norm": 0.247701806335025, "learning_rate": 6.391745802225434e-07, "loss": -0.0, "num_tokens": 77327901.0, "reward": 0.734375, "reward_std": 0.12046677619218826, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 483 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.030967940225528e-09, "advantages/std": 0.5960703492164612, "advantages/var": 0.355299861215034, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.0683030949839916, "grad_norm": 0.3241457763923432, "learning_rate": 6.378900686498288e-07, "loss": 0.0, "num_tokens": 77484223.0, "reward": 0.7421875, "reward_std": 0.17662061750888824, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 484 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 8.917319186133034e-09, "advantages/std": 0.5483086705207825, "advantages/var": 0.300642398168268, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.072572038420491, "grad_norm": 0.26748724345829983, "learning_rate": 6.366045713499128e-07, "loss": 0.0, "num_tokens": 77636489.0, "reward": 0.8203125, "reward_std": 0.1533464789390564, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 485 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 6.298352535277312e-09, "advantages/std": 0.369669109582901, "advantages/var": 0.13665525057981487, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.07684098185699, "grad_norm": 0.16791198948334604, "learning_rate": 6.353180975123594e-07, "loss": 0.0, "num_tokens": 77785589.0, "reward": 0.73828125, "reward_std": 0.06944026052951813, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 486 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958618460012092e-10, "advantages/std": 0.46759626269340515, "advantages/var": 0.21864626488483996, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 2.08110992529349, "grad_norm": 0.21844686995293106, "learning_rate": 6.340306563337141e-07, "loss": -0.0, "num_tokens": 77934274.0, "reward": 0.63671875, "reward_std": 0.10941823571920395, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 487 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.845910451722217e-09, "advantages/std": 0.572686493396759, "advantages/var": 0.3279698197190761, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.0853788687299892, "grad_norm": 0.27969529596515874, "learning_rate": 6.327422570174372e-07, "loss": -0.0, "num_tokens": 78088673.0, "reward": 0.69140625, "reward_std": 0.1632179617881775, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 488 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 8.20278608731468e-09, "advantages/std": 0.5960710644721985, "advantages/var": 0.3553007139010198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.089647812166489, "grad_norm": 0.25314858952478914, "learning_rate": 6.314529087738386e-07, "loss": 0.0, "num_tokens": 78252846.0, "reward": 0.765625, "reward_std": 0.17609265446662903, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 489 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.3376001869689765e-09, "advantages/std": 0.6612836718559265, "advantages/var": 0.4372960946632567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.0939167556029883, "grad_norm": 0.28461404439506616, "learning_rate": 6.301626208200115e-07, "loss": 0.0, "num_tokens": 78407410.0, "reward": 0.75390625, "reward_std": 0.2205488383769989, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 490 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 7.511337010179756e-09, "advantages/std": 0.4959556758403778, "advantages/var": 0.2459720323982859, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.0981856990394876, "grad_norm": 0.20324479481628369, "learning_rate": 6.288714023797671e-07, "loss": -0.0, "num_tokens": 78560524.0, "reward": 0.7578125, "reward_std": 0.11705183982849121, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 491 }, { "advantages/mean": 4.889443516731262e-09, "advantages/snr": 8.917142764397378e-09, "advantages/std": 0.5483195185661316, "advantages/var": 0.3006542944405943, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.1024546424759873, "grad_norm": 0.24222228283880398, "learning_rate": 6.275792626835679e-07, "loss": -0.0, "num_tokens": 78715417.0, "reward": 0.62109375, "reward_std": 0.16477571427822113, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 492 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5483072400093079, "advantages/var": 0.30064082944662474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.1067235859124867, "grad_norm": 0.2696121215659343, "learning_rate": 6.262862109684625e-07, "loss": -0.0, "num_tokens": 78864264.0, "reward": 0.6953125, "reward_std": 0.15110859274864197, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 493 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 9.79779051410865e-09, "advantages/std": 0.5227988958358765, "advantages/var": 0.2733186854872116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.110992529348986, "grad_norm": 0.22331423821502822, "learning_rate": 6.249922564780192e-07, "loss": -0.0, "num_tokens": 79021568.0, "reward": 0.71875, "reward_std": 0.14689236879348755, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 494 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755670987579601e-09, "advantages/std": 0.49595534801483154, "advantages/var": 0.24597170722451267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.1152614727854857, "grad_norm": 0.2496596017738257, "learning_rate": 6.236974084622597e-07, "loss": 0.0, "num_tokens": 79173348.0, "reward": 0.67578125, "reward_std": 0.11652141809463501, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 495 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 7.452419155772434e-09, "advantages/std": 0.43739205598831177, "advantages/var": 0.19131181064168246, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.119530416221985, "grad_norm": 0.12826516345732708, "learning_rate": 6.224016761775932e-07, "loss": 0.0, "num_tokens": 79326972.0, "reward": 0.76953125, "reward_std": 0.09153735637664795, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 496 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.388838395442294e-10, "advantages/std": 0.49597325921058655, "advantages/var": 0.24598947385197167, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.1237993596584843, "grad_norm": 0.2844272076231935, "learning_rate": 6.211050688867503e-07, "loss": -0.0, "num_tokens": 79483123.0, "reward": 0.73828125, "reward_std": 0.13466677069664001, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 497 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.2250808180061485e-09, "advantages/std": 0.3306407332420349, "advantages/var": 0.10932329447883049, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.128068303094984, "grad_norm": 0.15637960274214446, "learning_rate": 6.198075958587167e-07, "loss": 0.0, "num_tokens": 79636927.0, "reward": 0.74609375, "reward_std": 0.05497432500123978, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 498 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.9060327590462414e-10, "advantages/std": 0.5960795879364014, "advantages/var": 0.35531087515443005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 2.1323372465314834, "grad_norm": 0.3053836619718508, "learning_rate": 6.18509266368667e-07, "loss": 0.0, "num_tokens": 79798753.0, "reward": 0.66796875, "reward_std": 0.18687278032302856, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 499 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876135291586783e-09, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.136606189967983, "grad_norm": 0.2533729605115467, "learning_rate": 6.172100896978985e-07, "loss": 0.0, "num_tokens": 79943968.0, "reward": 0.73828125, "reward_std": 0.10376540571451187, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 500 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1231767740615974e-09, "advantages/std": 0.5483072400093079, "advantages/var": 0.30064082944662474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.1408751334044824, "grad_norm": 0.33647486360513806, "learning_rate": 6.159100751337641e-07, "loss": -0.0, "num_tokens": 80088298.0, "reward": 0.7890625, "reward_std": 0.15110856294631958, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 501 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.481332608054646e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.1451440768409817, "grad_norm": 0.28214034878312183, "learning_rate": 6.146092319696072e-07, "loss": -0.0, "num_tokens": 80234565.0, "reward": 0.6953125, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 502 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.489668896152697e-09, "advantages/std": 0.46759358048439026, "advantages/var": 0.21864375651021195, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 2.1494130202774815, "grad_norm": 0.23343503528779358, "learning_rate": 6.133075695046943e-07, "loss": 0.0, "num_tokens": 80368916.0, "reward": 0.78515625, "reward_std": 0.1060032919049263, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 503 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.140424438653788e-09, "advantages/std": 0.6185687184333801, "advantages/var": 0.3826272594243143, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.153681963713981, "grad_norm": 0.2595846374198061, "learning_rate": 6.120050970441485e-07, "loss": -0.0, "num_tokens": 80525671.0, "reward": 0.67578125, "reward_std": 0.18649208545684814, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 504 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.749637419262528e-10, "advantages/std": 0.4049483835697174, "advantages/var": 0.16398319335572697, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.15795090715048, "grad_norm": 0.19438375634725735, "learning_rate": 6.107018238988837e-07, "loss": -0.0, "num_tokens": 80670529.0, "reward": 0.74609375, "reward_std": 0.07995839416980743, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 505 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.6615187883262115e-09, "advantages/std": 0.43740183115005493, "advantages/var": 0.19132036189342116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.16221985058698, "grad_norm": 0.22311545662022994, "learning_rate": 6.093977593855375e-07, "loss": 0.0, "num_tokens": 80819363.0, "reward": 0.7578125, "reward_std": 0.10007961094379425, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 506 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.794238057725804e-09, "advantages/std": 0.5483013987541199, "advantages/var": 0.30063442387572437, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.166488794023479, "grad_norm": 0.22187008401811345, "learning_rate": 6.080929128264045e-07, "loss": 0.0, "num_tokens": 80982643.0, "reward": 0.7265625, "reward_std": 0.14545084536075592, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 507 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962789174963654e-09, "advantages/std": 0.46759456396102905, "advantages/var": 0.2186446762459049, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.1707577374599785, "grad_norm": 0.2494787558010566, "learning_rate": 6.067872935493702e-07, "loss": 0.0, "num_tokens": 81118448.0, "reward": 0.80078125, "reward_std": 0.10718034207820892, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 508 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562928780215032e-09, "advantages/std": 0.5227848291397095, "advantages/var": 0.2733039775786352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.175026680896478, "grad_norm": 0.28498267319507764, "learning_rate": 6.054809108878437e-07, "loss": 0.0, "num_tokens": 81263747.0, "reward": 0.75, "reward_std": 0.13151778280735016, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 509 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.919888970950097e-09, "advantages/std": 0.43740561604499817, "advantages/var": 0.19132367294770436, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.1792956243329775, "grad_norm": 0.21060269613003632, "learning_rate": 6.041737741806913e-07, "loss": 0.0, "num_tokens": 81408764.0, "reward": 0.78125, "reward_std": 0.10296659171581268, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 510 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.816804076067719e-09, "advantages/std": 0.33063092827796936, "advantages/var": 0.10931681073395172, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.1835645677694773, "grad_norm": 0.14004657643654717, "learning_rate": 6.028658927721697e-07, "loss": -0.0, "num_tokens": 81541398.0, "reward": 0.80078125, "reward_std": 0.04761157184839249, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 511 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196592786189577e-09, "advantages/std": 0.5726943016052246, "advantages/var": 0.32797876309109597, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.1878335112059766, "grad_norm": 0.3074099145428851, "learning_rate": 6.015572760118596e-07, "loss": 0.0, "num_tokens": 81702611.0, "reward": 0.67578125, "reward_std": 0.17399565875530243, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 512 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.877807622595892e-09, "advantages/std": 0.4959627091884613, "advantages/var": 0.24597900890555824, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.192102454642476, "grad_norm": 0.2443639003613943, "learning_rate": 6.002479332545981e-07, "loss": 0.0, "num_tokens": 81850958.0, "reward": 0.70703125, "reward_std": 0.12335620820522308, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 513 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.8168092671155283e-09, "advantages/std": 0.4959454834461212, "advantages/var": 0.2459619225506069, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 2.1963713980789756, "grad_norm": 0.18910867071039786, "learning_rate": 5.989378738604121e-07, "loss": 0.0, "num_tokens": 82004261.0, "reward": 0.73828125, "reward_std": 0.10627168416976929, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 514 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.45006423038189e-09, "advantages/std": 0.4959668219089508, "advantages/var": 0.24598308843446492, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.200640341515475, "grad_norm": 0.19823134761784342, "learning_rate": 5.976271071944516e-07, "loss": 0.0, "num_tokens": 82159605.0, "reward": 0.69140625, "reward_std": 0.12900903820991516, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 515 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.481341176536925e-09, "advantages/std": 0.46760013699531555, "advantages/var": 0.21864988811803787, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.2049092849519742, "grad_norm": 0.21387409554665499, "learning_rate": 5.963156426269227e-07, "loss": -0.0, "num_tokens": 82300202.0, "reward": 0.796875, "reward_std": 0.11283563077449799, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 516 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721843985338186e-09, "advantages/std": 0.5227872133255005, "advantages/var": 0.27330647041664236, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.209178228388474, "grad_norm": 0.2458970836003116, "learning_rate": 5.950034895330204e-07, "loss": -0.0, "num_tokens": 82450773.0, "reward": 0.7578125, "reward_std": 0.1349327266216278, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 517 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.492866815227978e-10, "advantages/std": 0.5482969284057617, "advantages/var": 0.300629521699193, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.2134471718249733, "grad_norm": 0.2448989545301728, "learning_rate": 5.936906572928624e-07, "loss": -0.0, "num_tokens": 82591859.0, "reward": 0.71875, "reward_std": 0.14032597839832306, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 518 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5960641503334045, "advantages/var": 0.3552924713126835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.2177161152614726, "grad_norm": 0.3312415074798024, "learning_rate": 5.923771552914201e-07, "loss": -0.0, "num_tokens": 82763398.0, "reward": 0.5546875, "reward_std": 0.16808080673217773, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49797385931015015, "step": 519 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.472177112287613e-09, "advantages/std": 0.5726823806762695, "advantages/var": 0.3279651091370397, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.2219850586979724, "grad_norm": 0.2937235840964889, "learning_rate": 5.91062992918454e-07, "loss": 0.0, "num_tokens": 82914373.0, "reward": 0.74609375, "reward_std": 0.15820932388305664, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 520 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.572451479362036e-09, "advantages/std": 0.4959532916545868, "advantages/var": 0.24596966750301963, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.2262540021344717, "grad_norm": 0.21928040534652227, "learning_rate": 5.897481795684446e-07, "loss": -0.0, "num_tokens": 83078131.0, "reward": 0.703125, "reward_std": 0.115341916680336, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 521 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2524805971912827e-09, "advantages/std": 0.5726844668388367, "advantages/var": 0.3279674985584826, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.2305229455709714, "grad_norm": 0.284497900137537, "learning_rate": 5.884327246405262e-07, "loss": -0.0, "num_tokens": 83237505.0, "reward": 0.65234375, "reward_std": 0.15991924703121185, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 522 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755668505089878e-09, "advantages/std": 0.4959556758403778, "advantages/var": 0.2459720323982859, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.2347918890074707, "grad_norm": 0.25720163427951526, "learning_rate": 5.8711663753842e-07, "loss": 0.0, "num_tokens": 83375158.0, "reward": 0.8125, "reward_std": 0.11705183982849121, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.3910769522190094, "step": 523 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.225095585998201e-09, "advantages/std": 0.4959593713283539, "advantages/var": 0.245975698008416, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.23906083244397, "grad_norm": 0.25599797050896705, "learning_rate": 5.857999276703657e-07, "loss": 0.0, "num_tokens": 83542285.0, "reward": 0.64453125, "reward_std": 0.12046922743320465, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 524 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.252482966806137e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.24332977588047, "grad_norm": 0.29768190456929716, "learning_rate": 5.84482604449055e-07, "loss": 0.0, "num_tokens": 83686746.0, "reward": 0.75, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 525 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.4083492500432603e-09, "advantages/std": 0.6612866520881653, "advantages/var": 0.43730003622997415, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.247598719316969, "grad_norm": 0.32444568808383006, "learning_rate": 5.83164677291565e-07, "loss": 0.0, "num_tokens": 83839387.0, "reward": 0.67578125, "reward_std": 0.22449666261672974, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 526 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 9.958571491388554e-09, "advantages/std": 0.46759846806526184, "advantages/var": 0.2186483273369797, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 2.2518676627534684, "grad_norm": 0.20529368451226218, "learning_rate": 5.818461556192892e-07, "loss": -0.0, "num_tokens": 84014173.0, "reward": 0.578125, "reward_std": 0.11059774458408356, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49482619762420654, "step": 527 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5482991337776184, "advantages/var": 0.3006319401012867, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.256136606189968, "grad_norm": 0.26826764902683475, "learning_rate": 5.805270488578714e-07, "loss": -0.0, "num_tokens": 84163860.0, "reward": 0.7578125, "reward_std": 0.14203590154647827, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 528 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.958740326614902e-10, "advantages/std": 0.4675905406475067, "advantages/var": 0.21864091370302763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.2604055496264674, "grad_norm": 0.28990349934920284, "learning_rate": 5.792073664371383e-07, "loss": 0.0, "num_tokens": 84307791.0, "reward": 0.6953125, "reward_std": 0.10205793380737305, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 529 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.123717230213227e-09, "advantages/std": 0.6816261410713196, "advantages/var": 0.46461419619177846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.2646744930629668, "grad_norm": 0.6035433531569014, "learning_rate": 5.778871177910315e-07, "loss": -0.0, "num_tokens": 84478520.0, "reward": 0.70703125, "reward_std": 0.22081723809242249, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 530 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.979264165711202e-10, "advantages/std": 0.4676004946231842, "advantages/var": 0.21865022257184652, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.2689434364994665, "grad_norm": 0.23720868638509054, "learning_rate": 5.7656631235754e-07, "loss": 0.0, "num_tokens": 84623133.0, "reward": 0.71484375, "reward_std": 0.113366037607193, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 531 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.323178301971e-10, "advantages/std": 0.4373902678489685, "advantages/var": 0.1913102464089924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.273212379935966, "grad_norm": 0.1800617723214338, "learning_rate": 5.752449595786341e-07, "loss": 0.0, "num_tokens": 84771437.0, "reward": 0.71484375, "reward_std": 0.08929946273565292, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 532 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.43739479780197144, "advantages/var": 0.19131420914422748, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.2774813233724656, "grad_norm": 0.24425106303274358, "learning_rate": 5.739230689001955e-07, "loss": 0.0, "num_tokens": 84929213.0, "reward": 0.69921875, "reward_std": 0.09324727952480316, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 533 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262301431742259e-09, "advantages/std": 0.5726880431175232, "advantages/var": 0.3279715947297781, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 2.281750266808965, "grad_norm": 0.26506567231170286, "learning_rate": 5.726006497719524e-07, "loss": -0.0, "num_tokens": 85093340.0, "reward": 0.62890625, "reward_std": 0.16557206213474274, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 534 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.979420942197835e-10, "advantages/std": 0.4675857722759247, "advantages/var": 0.2186364544348729, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 2.286019210245464, "grad_norm": 0.20899718020336794, "learning_rate": 5.712777116474102e-07, "loss": 0.0, "num_tokens": 85244179.0, "reward": 0.73828125, "reward_std": 0.09916850179433823, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 535 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 8.049523192160078e-09, "advantages/std": 0.40494683384895325, "advantages/var": 0.16398193824429175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.2902881536819635, "grad_norm": 0.19129106216091937, "learning_rate": 5.699542639837843e-07, "loss": 0.0, "num_tokens": 85388966.0, "reward": 0.8125, "reward_std": 0.07825092226266861, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.3910769522190094, "step": 536 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5483127236366272, "advantages/var": 0.3006468429018163, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.2945570971184632, "grad_norm": 0.25350005366655237, "learning_rate": 5.686303162419324e-07, "loss": 0.0, "num_tokens": 85550800.0, "reward": 0.67578125, "reward_std": 0.15623590350151062, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 537 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 5.164009439899339e-09, "advantages/std": 0.49595901370048523, "advantages/var": 0.2459753432707581, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 2.2988260405549625, "grad_norm": 0.2590780841185054, "learning_rate": 5.673058778862878e-07, "loss": -0.0, "num_tokens": 85718806.0, "reward": 0.6484375, "reward_std": 0.11993881314992905, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 538 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 5.1746161723483324e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.3030949839914623, "grad_norm": 0.16751371969958892, "learning_rate": 5.659809583847907e-07, "loss": 0.0, "num_tokens": 85878807.0, "reward": 0.640625, "reward_std": 0.08337579667568207, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 539 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1291852300839092e-09, "advantages/std": 0.6185804605484009, "advantages/var": 0.38264178617227174, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.3073639274279616, "grad_norm": 0.29921396743821344, "learning_rate": 5.646555672088202e-07, "loss": 0.0, "num_tokens": 86036114.0, "reward": 0.67578125, "reward_std": 0.20345547795295715, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 540 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0688468277682715e-08, "advantages/std": 0.5228003859519958, "advantages/var": 0.2733202435515558, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 2.311632870864461, "grad_norm": 0.18961370483557366, "learning_rate": 5.633297138331284e-07, "loss": 0.0, "num_tokens": 86212591.0, "reward": 0.7109375, "reward_std": 0.14913025498390198, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 541 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.323123171578266e-09, "advantages/std": 0.43739479780197144, "advantages/var": 0.19131420914422748, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.3159018143009606, "grad_norm": 0.21369706487245865, "learning_rate": 5.620034077357707e-07, "loss": 0.0, "num_tokens": 86356705.0, "reward": 0.80859375, "reward_std": 0.09324727952480316, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 542 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.473288895977308e-09, "advantages/std": 0.4675827622413635, "advantages/var": 0.2186336395452635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.32017075773746, "grad_norm": 0.25034432748651897, "learning_rate": 5.606766583980389e-07, "loss": 0.0, "num_tokens": 86514364.0, "reward": 0.6640625, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 543 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562896688905427e-09, "advantages/std": 0.5227895379066467, "advantages/var": 0.2733089009446452, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.3244397011739593, "grad_norm": 0.2590621429358719, "learning_rate": 5.593494753043937e-07, "loss": 0.0, "num_tokens": 86681566.0, "reward": 0.6171875, "reward_std": 0.13664263486862183, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 544 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.225151187992327e-09, "advantages/std": 0.495952844619751, "advantages/var": 0.24596922408642286, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.328708644610459, "grad_norm": 0.2321484076747434, "learning_rate": 5.580218679423964e-07, "loss": -0.0, "num_tokens": 86816847.0, "reward": 0.79296875, "reward_std": 0.11310647428035736, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 545 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958639405622186e-10, "advantages/std": 0.46759527921676636, "advantages/var": 0.2186453451458057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 2.3329775880469583, "grad_norm": 0.24188330925260715, "learning_rate": 5.56693845802641e-07, "loss": -0.0, "num_tokens": 86987597.0, "reward": 0.69140625, "reward_std": 0.10824117809534073, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 546 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.979341600636731e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.3372465314834576, "grad_norm": 0.17450798270363213, "learning_rate": 5.553654183786871e-07, "loss": -0.0, "num_tokens": 87155677.0, "reward": 0.671875, "reward_std": 0.1054728701710701, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 547 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.298381478091215e-10, "advantages/std": 0.3696674108505249, "advantages/var": 0.13665399464493078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.3415154749199574, "grad_norm": 0.16176405546533756, "learning_rate": 5.540365951669912e-07, "loss": -0.0, "num_tokens": 87298798.0, "reward": 0.734375, "reward_std": 0.0677327960729599, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 548 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.9061065803549066e-10, "advantages/std": 0.5960683226585388, "advantages/var": 0.35529744527696394, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.3457844183564567, "grad_norm": 0.2665223749786248, "learning_rate": 5.527073856668391e-07, "loss": -0.0, "num_tokens": 87478889.0, "reward": 0.6328125, "reward_std": 0.1732056736946106, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 549 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.1293189085785e-09, "advantages/std": 0.43738049268722534, "advantages/var": 0.19130169538331998, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.3500533617929564, "grad_norm": 0.2916496247007535, "learning_rate": 5.51377799380278e-07, "loss": -0.0, "num_tokens": 87626632.0, "reward": 0.65625, "reward_std": 0.08075720071792603, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 550 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.468468828956508e-09, "advantages/std": 0.5960770845413208, "advantages/var": 0.3553078907152809, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 2.3543223052294557, "grad_norm": 0.2702531847026088, "learning_rate": 5.500478458120493e-07, "loss": -0.0, "num_tokens": 87803587.0, "reward": 0.640625, "reward_std": 0.18451623618602753, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 551 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235199298796701e-09, "advantages/std": 0.5227786302566528, "advantages/var": 0.27329749625302213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.358591248665955, "grad_norm": 0.2505033849756049, "learning_rate": 5.487175344695187e-07, "loss": -0.0, "num_tokens": 87974455.0, "reward": 0.75, "reward_std": 0.12415501475334167, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 552 }, { "advantages/mean": -7.2177499532699585e-09, "advantages/snr": 1.0589000793862824e-08, "advantages/std": 0.681627094745636, "advantages/var": 0.4646154962913762, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 2.362860192102455, "grad_norm": 0.35984344725559825, "learning_rate": 5.473868748626109e-07, "loss": 0.0, "num_tokens": 88129378.0, "reward": 0.7734375, "reward_std": 0.2225247025489807, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 553 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.450106882338915e-09, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.367129135538954, "grad_norm": 0.25738104886057167, "learning_rate": 5.460558765037392e-07, "loss": 0.0, "num_tokens": 88277805.0, "reward": 0.69140625, "reward_std": 0.1255940943956375, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 554 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.09823736693788e-09, "advantages/std": 0.5726998448371887, "advantages/var": 0.32798511227654004, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.3713980789754534, "grad_norm": 0.3490442107506381, "learning_rate": 5.447245489077388e-07, "loss": 0.0, "num_tokens": 88433920.0, "reward": 0.73046875, "reward_std": 0.17965340614318848, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 555 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.789750026296212e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.375667022411953, "grad_norm": 0.28529706683678874, "learning_rate": 5.433929015917988e-07, "loss": -0.0, "num_tokens": 88596517.0, "reward": 0.65625, "reward_std": 0.13269482553005219, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 556 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.1040569632820925e-08, "advantages/std": 0.5483047366142273, "advantages/var": 0.30063808419359717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.3799359658484525, "grad_norm": 0.2523351201696841, "learning_rate": 5.420609440753935e-07, "loss": 0.0, "num_tokens": 88737878.0, "reward": 0.78515625, "reward_std": 0.14886823296546936, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 557 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.008234554568749e-09, "advantages/std": 0.5227926969528198, "advantages/var": 0.2733122039872029, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.3842049092849518, "grad_norm": 0.28765703107991825, "learning_rate": 5.407286858802147e-07, "loss": 0.0, "num_tokens": 88887978.0, "reward": 0.7265625, "reward_std": 0.13952961564064026, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 558 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.3969473161275114e-09, "advantages/std": 0.5483291149139404, "advantages/var": 0.3006648182623053, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.3884738527214515, "grad_norm": 0.2284492587672275, "learning_rate": 5.393961365301041e-07, "loss": -0.0, "num_tokens": 89058883.0, "reward": 0.62109375, "reward_std": 0.1777912974357605, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 559 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.273916450676268e-09, "advantages/std": 0.5483027696609497, "advantages/var": 0.30063592721786847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.392742796157951, "grad_norm": 0.3098345163048042, "learning_rate": 5.380633055509842e-07, "loss": 0.0, "num_tokens": 89222944.0, "reward": 0.6640625, "reward_std": 0.1459837108850479, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 560 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.763938073065637e-09, "advantages/std": 0.618582546710968, "advantages/var": 0.38264436709542693, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.3970117395944506, "grad_norm": 0.29717892647935373, "learning_rate": 5.36730202470791e-07, "loss": 0.0, "num_tokens": 89380376.0, "reward": 0.67578125, "reward_std": 0.20357662439346313, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 561 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.899073991974699e-09, "advantages/std": 0.5227798223495483, "advantages/var": 0.2732987426558253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 2.40128068303095, "grad_norm": 0.25529005934544713, "learning_rate": 5.35396836819406e-07, "loss": 0.0, "num_tokens": 89535948.0, "reward": 0.69921875, "reward_std": 0.1258624941110611, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 562 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2739268370849381e-09, "advantages/std": 0.5482982993125916, "advantages/var": 0.30063102502908023, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.405549626467449, "grad_norm": 0.3079728383819247, "learning_rate": 5.340632181285871e-07, "loss": 0.0, "num_tokens": 89695747.0, "reward": 0.7265625, "reward_std": 0.14085885882377625, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 563 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.453712769274014e-10, "advantages/std": 0.5227787494659424, "advantages/var": 0.27329762089317455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 2.409818569903949, "grad_norm": 0.28409215670000054, "learning_rate": 5.327293559319013e-07, "loss": 0.0, "num_tokens": 89842574.0, "reward": 0.796875, "reward_std": 0.1258600354194641, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 564 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196682913687935e-09, "advantages/std": 0.5726900696754456, "advantages/var": 0.3279739159048667, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.4140875133404482, "grad_norm": 0.3257234308296981, "learning_rate": 5.313952597646567e-07, "loss": 0.0, "num_tokens": 90003355.0, "reward": 0.63671875, "reward_std": 0.1655769646167755, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 565 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.5201672004113415e-09, "advantages/std": 0.5483164191246033, "advantages/var": 0.3006508954816276, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.4183564567769475, "grad_norm": 0.19915846849528268, "learning_rate": 5.300609391638335e-07, "loss": 0.0, "num_tokens": 90168401.0, "reward": 0.75390625, "reward_std": 0.1618887335062027, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 566 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.877808186781896e-09, "advantages/std": 0.49596256017684937, "advantages/var": 0.24597886109717493, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 2.4226254002134473, "grad_norm": 0.23995835912984506, "learning_rate": 5.287264036680165e-07, "loss": -0.0, "num_tokens": 90325006.0, "reward": 0.74609375, "reward_std": 0.1249450072646141, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 567 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.439358924571688e-09, "advantages/std": 0.5726848244667053, "advantages/var": 0.3279679081744611, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.4268943436499466, "grad_norm": 0.26202942056692013, "learning_rate": 5.273916628173269e-07, "loss": -0.0, "num_tokens": 90488837.0, "reward": 0.6953125, "reward_std": 0.16044965386390686, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 568 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.7248786584154183e-09, "advantages/std": 0.4049513339996338, "advantages/var": 0.16398558290808296, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.431163287086446, "grad_norm": 0.21189495244336484, "learning_rate": 5.260567261533537e-07, "loss": 0.0, "num_tokens": 90639437.0, "reward": 0.77734375, "reward_std": 0.08166831731796265, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 569 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.098269735979208e-09, "advantages/std": 0.5726968050003052, "advantages/var": 0.32798163045755757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.4354322305229457, "grad_norm": 0.25179541657971016, "learning_rate": 5.247216032190853e-07, "loss": 0.0, "num_tokens": 90787086.0, "reward": 0.765625, "reward_std": 0.17464721202850342, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 570 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 9.797886581565322e-09, "advantages/std": 0.5227937698364258, "advantages/var": 0.27331332577978174, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.439701173959445, "grad_norm": 0.2519623444961031, "learning_rate": 5.233863035588426e-07, "loss": 0.0, "num_tokens": 90931241.0, "reward": 0.79296875, "reward_std": 0.13953207433223724, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 571 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599778655449369e-09, "advantages/std": 0.4049423336982727, "advantages/var": 0.16397829362100325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.4439701173959447, "grad_norm": 0.18670377125438592, "learning_rate": 5.220508367182089e-07, "loss": 0.0, "num_tokens": 91081000.0, "reward": 0.79296875, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 572 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.49595820903778076, "advantages/var": 0.24597454511196304, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.448239060832444, "grad_norm": 0.22455891352376486, "learning_rate": 5.207152122439635e-07, "loss": 0.0, "num_tokens": 91240089.0, "reward": 0.671875, "reward_std": 0.12046677619218826, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 573 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.7896780752123385e-09, "advantages/std": 0.5227921605110168, "advantages/var": 0.2733116430917768, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.4525080042689433, "grad_norm": 0.26883169817327435, "learning_rate": 5.193794396840116e-07, "loss": 0.0, "num_tokens": 91384725.0, "reward": 0.83984375, "reward_std": 0.13888297975063324, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.36746934056282043, "step": 574 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.794176017006791e-09, "advantages/std": 0.548306405544281, "advantages/var": 0.30063991436088955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.456776947705443, "grad_norm": 0.23086236527150877, "learning_rate": 5.180435285873181e-07, "loss": -0.0, "num_tokens": 91543106.0, "reward": 0.6875, "reward_std": 0.14993153512477875, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 575 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.547886080441306e-09, "advantages/std": 0.5482913255691528, "advantages/var": 0.30062337769437875, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.4610458911419424, "grad_norm": 0.22269172564936424, "learning_rate": 5.167074885038372e-07, "loss": 0.0, "num_tokens": 91704890.0, "reward": 0.68359375, "reward_std": 0.13349363207817078, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 576 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.3277959731395836e-09, "advantages/std": 0.701404869556427, "advantages/var": 0.4919687910374684, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.4653148345784417, "grad_norm": 0.2822230840932937, "learning_rate": 5.153713289844461e-07, "loss": 0.0, "num_tokens": 91862016.0, "reward": 0.68359375, "reward_std": 0.2585534155368805, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 577 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.168819230075082e-09, "advantages/std": 0.6612796783447266, "advantages/var": 0.437290812991705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.4695837780149414, "grad_norm": 0.28122934660841753, "learning_rate": 5.14035059580875e-07, "loss": 0.0, "num_tokens": 92026532.0, "reward": 0.703125, "reward_std": 0.21489356458187103, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 578 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.2596836075630617e-09, "advantages/std": 0.369665265083313, "advantages/var": 0.13665240820911606, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.4738527214514408, "grad_norm": 0.1976911781209467, "learning_rate": 5.1269868984564e-07, "loss": -0.0, "num_tokens": 92181793.0, "reward": 0.65625, "reward_std": 0.06549490243196487, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 579 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.781509278854418e-09, "advantages/std": 0.5227716565132141, "advantages/var": 0.2732902048535699, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 2.47812166488794, "grad_norm": 0.28255159007796576, "learning_rate": 5.113622293319749e-07, "loss": -0.0, "num_tokens": 92335607.0, "reward": 0.6640625, "reward_std": 0.1173202246427536, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 580 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.1491018816281693e-09, "advantages/std": 0.3696778416633606, "advantages/var": 0.1366617066168807, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.48239060832444, "grad_norm": 0.22949898840587127, "learning_rate": 5.100256875937613e-07, "loss": -0.0, "num_tokens": 92491950.0, "reward": 0.68359375, "reward_std": 0.07680301368236542, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 581 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5483025908470154, "advantages/var": 0.30063573112954955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.486659551760939, "grad_norm": 0.2499851224101172, "learning_rate": 5.086890741854626e-07, "loss": -0.0, "num_tokens": 92640587.0, "reward": 0.78125, "reward_std": 0.14757250249385834, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 582 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6985493589245975e-09, "advantages/std": 0.5483046770095825, "advantages/var": 0.3006380188305826, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.490928495197439, "grad_norm": 0.2736270277798583, "learning_rate": 5.073523986620538e-07, "loss": -0.0, "num_tokens": 92790601.0, "reward": 0.75390625, "reward_std": 0.14716321229934692, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 583 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.3361071279842629e-09, "advantages/std": 0.5227813720703125, "advantages/var": 0.2733003629837185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.495197438633938, "grad_norm": 0.21115201968945138, "learning_rate": 5.060156705789544e-07, "loss": -0.0, "num_tokens": 92958566.0, "reward": 0.67578125, "reward_std": 0.12810036540031433, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 584 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.065611325146664e-10, "advantages/std": 0.5726829767227173, "advantages/var": 0.32796579182799235, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.4994663820704375, "grad_norm": 0.22795342467727167, "learning_rate": 5.046788994919594e-07, "loss": -0.0, "num_tokens": 93128784.0, "reward": 0.68359375, "reward_std": 0.15927013754844666, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 585 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.8182291194331764e-09, "advantages/std": 0.6402790546417236, "advantages/var": 0.4099572678128993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.503735325506937, "grad_norm": 0.29052637275346876, "learning_rate": 5.033420949571712e-07, "loss": 0.0, "num_tokens": 93290584.0, "reward": 0.7109375, "reward_std": 0.19818973541259766, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 586 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.249816775513951e-09, "advantages/std": 0.5960639119148254, "advantages/var": 0.3552921870872048, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.5080042689434365, "grad_norm": 0.3227661999483521, "learning_rate": 5.020052665309311e-07, "loss": 0.0, "num_tokens": 93444117.0, "reward": 0.77734375, "reward_std": 0.16755038499832153, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 587 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.077939062236641e-09, "advantages/std": 0.596068263053894, "advantages/var": 0.3552973742200862, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.512273212379936, "grad_norm": 0.28291810883393154, "learning_rate": 5.006684237697519e-07, "loss": 0.0, "num_tokens": 93600079.0, "reward": 0.6875, "reward_std": 0.1732056736946106, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 588 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.218654289967664e-09, "advantages/std": 0.5483183860778809, "advantages/var": 0.300653052511052, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.5165421558164356, "grad_norm": 0.3046525980956177, "learning_rate": 4.993315762302482e-07, "loss": 0.0, "num_tokens": 93756932.0, "reward": 0.671875, "reward_std": 0.1630682349205017, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 589 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4938096208518726e-09, "advantages/std": 0.46759098768234253, "advantages/var": 0.2186413317617486, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.520811099252935, "grad_norm": 0.23476904635933984, "learning_rate": 4.979947334690689e-07, "loss": -0.0, "num_tokens": 93913252.0, "reward": 0.64453125, "reward_std": 0.10429336130619049, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 590 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.296726261184258e-09, "advantages/std": 0.5960670709609985, "advantages/var": 0.35529595308402406, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.525080042689434, "grad_norm": 0.3037000991415936, "learning_rate": 4.96657905042829e-07, "loss": 0.0, "num_tokens": 94069395.0, "reward": 0.75, "reward_std": 0.1726727932691574, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 591 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.492741255292345e-10, "advantages/std": 0.5483050346374512, "advantages/var": 0.30063841100877653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 2.529348986125934, "grad_norm": 0.221341914770484, "learning_rate": 4.953211005080407e-07, "loss": -0.0, "num_tokens": 94243004.0, "reward": 0.609375, "reward_std": 0.14939865469932556, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 592 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 2.5336179295624333, "grad_norm": 0.2662025714061716, "learning_rate": 4.939843294210455e-07, "loss": -0.0, "num_tokens": 94405400.0, "reward": 0.625, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4850712716579437, "step": 593 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1232210892462864e-09, "advantages/std": 0.548295795917511, "advantages/var": 0.30062827982081686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.537886872998933, "grad_norm": 0.27605722927884196, "learning_rate": 4.926476013379462e-07, "loss": 0.0, "num_tokens": 94555071.0, "reward": 0.66015625, "reward_std": 0.13861851394176483, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 594 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.124909012720456e-09, "advantages/std": 0.5960637927055359, "advantages/var": 0.35529204497450806, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 2.5421558164354323, "grad_norm": 0.2581301367381883, "learning_rate": 4.913109258145374e-07, "loss": 0.0, "num_tokens": 94715089.0, "reward": 0.66796875, "reward_std": 0.16913917660713196, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 595 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.4896708003183657e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.5464247598719316, "grad_norm": 0.20105910467908297, "learning_rate": 4.899743124062388e-07, "loss": 0.0, "num_tokens": 94868432.0, "reward": 0.734375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 596 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.322974107616147e-10, "advantages/std": 0.4374070465564728, "advantages/var": 0.19132492437725634, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.550693703308431, "grad_norm": 0.22232398034984727, "learning_rate": 4.886377706680252e-07, "loss": -0.0, "num_tokens": 95019364.0, "reward": 0.72265625, "reward_std": 0.1046740710735321, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 597 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.252490752707826e-09, "advantages/std": 0.5726826786994934, "advantages/var": 0.3279654504824272, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 2.5549626467449307, "grad_norm": 0.24824824609667348, "learning_rate": 4.873013101543599e-07, "loss": 0.0, "num_tokens": 95202253.0, "reward": 0.53125, "reward_std": 0.15873973071575165, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.5, "step": 598 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.246425098394015e-09, "advantages/std": 0.5482980012893677, "advantages/var": 0.30063069821791544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.55923159018143, "grad_norm": 0.28294020874569886, "learning_rate": 4.859649404191251e-07, "loss": -0.0, "num_tokens": 95365009.0, "reward": 0.75390625, "reward_std": 0.14032843708992004, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 599 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.008258775018605e-09, "advantages/std": 0.5227895379066467, "advantages/var": 0.2733089009446452, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.5635005336179297, "grad_norm": 0.3325270821562332, "learning_rate": 4.846286710155539e-07, "loss": -0.0, "num_tokens": 95507354.0, "reward": 0.765625, "reward_std": 0.13664263486862183, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 600 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.95873080567915e-10, "advantages/std": 0.46759098768234253, "advantages/var": 0.2186413317617486, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.567769477054429, "grad_norm": 0.2593795240903901, "learning_rate": 4.832925114961628e-07, "loss": -0.0, "num_tokens": 95676481.0, "reward": 0.63671875, "reward_std": 0.10429336875677109, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 601 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.882008025557002e-09, "advantages/std": 0.6185697317123413, "advantages/var": 0.3826285129906779, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 2.5720384204909283, "grad_norm": 0.338763944385851, "learning_rate": 4.819564714126818e-07, "loss": 0.0, "num_tokens": 95840225.0, "reward": 0.6015625, "reward_std": 0.18819957971572876, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4905354380607605, "step": 602 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983547925259405e-09, "advantages/std": 0.4675844609737396, "advantages/var": 0.21863522814410263, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.576307363927428, "grad_norm": 0.22735309130132134, "learning_rate": 4.806205603159882e-07, "loss": 0.0, "num_tokens": 96008068.0, "reward": 0.6875, "reward_std": 0.0974610224366188, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 603 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987569352881188e-09, "advantages/std": 0.4675987958908081, "advantages/var": 0.21864863391853362, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.5805763073639274, "grad_norm": 0.23837085253396123, "learning_rate": 4.792847877560366e-07, "loss": -0.0, "num_tokens": 96176960.0, "reward": 0.55859375, "reward_std": 0.11112815886735916, "rewards/drgrpo_math_reward/mean": 0.55859375, "rewards/drgrpo_math_reward/std": 0.4975275993347168, "step": 604 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.882008932297716e-09, "advantages/std": 0.6185694336891174, "advantages/var": 0.38262814429447545, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.584845250800427, "grad_norm": 0.3473234539247001, "learning_rate": 4.779491632817911e-07, "loss": 0.0, "num_tokens": 96317343.0, "reward": 0.73828125, "reward_std": 0.18766915798187256, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 605 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.687352269762781e-09, "advantages/std": 0.5960652232170105, "advantages/var": 0.35529375032874455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.5891141942369265, "grad_norm": 0.29438078792503247, "learning_rate": 4.766136964411575e-07, "loss": 0.0, "num_tokens": 96467809.0, "reward": 0.69921875, "reward_std": 0.16978827118873596, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 606 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6722398485864743e-09, "advantages/std": 0.5227763652801514, "advantages/var": 0.27329512809552625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.5933831376734258, "grad_norm": 0.2350516882093116, "learning_rate": 4.752783967809146e-07, "loss": 0.0, "num_tokens": 96612952.0, "reward": 0.765625, "reward_std": 0.12244509160518646, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 607 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.991740829431179e-09, "advantages/std": 0.4675922393798828, "advantages/var": 0.21864250232829363, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.597652081109925, "grad_norm": 0.2820015262201967, "learning_rate": 4.7394327384664647e-07, "loss": -0.0, "num_tokens": 96769729.0, "reward": 0.6484375, "reward_std": 0.10429581999778748, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 608 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.408338946629123e-09, "advantages/std": 0.33064574003219604, "advantages/var": 0.10932660540143857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.601921024546425, "grad_norm": 0.15699078968341892, "learning_rate": 4.7260833718267303e-07, "loss": -0.0, "num_tokens": 96905296.0, "reward": 0.69921875, "reward_std": 0.05786130577325821, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 609 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2524545316557682e-09, "advantages/std": 0.5726890563964844, "advantages/var": 0.32797275531629566, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 2.606189967982924, "grad_norm": 0.2880655388696752, "learning_rate": 4.712735963319833e-07, "loss": -0.0, "num_tokens": 97084796.0, "reward": 0.5703125, "reward_std": 0.16557452082633972, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4960011839866638, "step": 610 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.906074942309521e-09, "advantages/std": 0.5960731506347656, "advantages/var": 0.355303200907656, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 2.610458911419424, "grad_norm": 0.33861166709668683, "learning_rate": 4.699390608361665e-07, "loss": 0.0, "num_tokens": 97262848.0, "reward": 0.6640625, "reward_std": 0.17950759828090668, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 611 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.547755075542108e-09, "advantages/std": 0.5483195185661316, "advantages/var": 0.3006542944405943, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.614727854855923, "grad_norm": 0.2364570063084319, "learning_rate": 4.686047402353433e-07, "loss": 0.0, "num_tokens": 97422059.0, "reward": 0.81640625, "reward_std": 0.16477571427822113, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 612 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.511102309549375e-09, "advantages/std": 0.4959711730480194, "advantages/var": 0.24598740449462841, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.6189967982924225, "grad_norm": 0.2514100284273653, "learning_rate": 4.672706440680988e-07, "loss": -0.0, "num_tokens": 97570673.0, "reward": 0.78125, "reward_std": 0.133487269282341, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 613 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.1991450029823e-09, "advantages/std": 0.4049604833126068, "advantages/var": 0.1639929930447801, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.6232657417289222, "grad_norm": 0.22105331477881468, "learning_rate": 4.6593678187141296e-07, "loss": -0.0, "num_tokens": 97730199.0, "reward": 0.57421875, "reward_std": 0.09020812809467316, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 614 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4083475927177635e-09, "advantages/std": 0.4959655702114105, "advantages/var": 0.24598184683512958, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.6275346851654215, "grad_norm": 0.2662951442220497, "learning_rate": 4.6460316318059394e-07, "loss": -0.0, "num_tokens": 97874707.0, "reward": 0.7421875, "reward_std": 0.12730157375335693, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 615 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983392291580966e-09, "advantages/std": 0.4676027297973633, "advantages/var": 0.21865231291394593, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.6318036286019213, "grad_norm": 0.2441469783117011, "learning_rate": 4.63269797529209e-07, "loss": 0.0, "num_tokens": 98028818.0, "reward": 0.71875, "reward_std": 0.1145455539226532, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 616 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9916976690634064e-09, "advantages/std": 0.46760237216949463, "advantages/var": 0.21865197845853857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.6360725720384206, "grad_norm": 0.23198391252921965, "learning_rate": 4.619366944490157e-07, "loss": 0.0, "num_tokens": 98180489.0, "reward": 0.73828125, "reward_std": 0.1140151396393776, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 617 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917366402546925e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.64034151547492, "grad_norm": 0.2740599648998935, "learning_rate": 4.60603863469896e-07, "loss": 0.0, "num_tokens": 98342280.0, "reward": 0.734375, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 618 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.032780062477822e-09, "advantages/std": 0.5726901888847351, "advantages/var": 0.3279740524448336, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.644610458911419, "grad_norm": 0.29072671404773964, "learning_rate": 4.592713141197853e-07, "loss": -0.0, "num_tokens": 98500354.0, "reward": 0.75390625, "reward_std": 0.16898700594902039, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 619 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.49255107598981e-10, "advantages/std": 0.5483173131942749, "advantages/var": 0.30065187594858855, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.648879402347919, "grad_norm": 0.2904695592467313, "learning_rate": 4.5793905592460655e-07, "loss": 0.0, "num_tokens": 98656811.0, "reward": 0.78515625, "reward_std": 0.16306579113006592, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 620 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.520222407296989e-09, "advantages/std": 0.5483109354972839, "advantages/var": 0.30064488198590666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 2.6531483457844183, "grad_norm": 0.2600164832503189, "learning_rate": 4.566070984082013e-07, "loss": 0.0, "num_tokens": 98802760.0, "reward": 0.78125, "reward_std": 0.15676140785217285, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 621 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.408326774808376e-09, "advantages/std": 0.4959729015827179, "advantages/var": 0.24598911910438037, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.657417289220918, "grad_norm": 0.21213210045500452, "learning_rate": 4.5527545109226116e-07, "loss": 0.0, "num_tokens": 98945541.0, "reward": 0.71875, "reward_std": 0.134136363863945, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 622 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.336089152630887e-09, "advantages/std": 0.522788405418396, "advantages/var": 0.2733077168399092, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.6616862326574173, "grad_norm": 0.24553847372710627, "learning_rate": 4.5394412349626086e-07, "loss": 0.0, "num_tokens": 99089384.0, "reward": 0.74609375, "reward_std": 0.13664019107818604, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 623 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.898971776345557e-09, "advantages/std": 0.5227907299995422, "advantages/var": 0.27331014737345427, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.6659551760939166, "grad_norm": 0.253230056323244, "learning_rate": 4.5261312513738915e-07, "loss": 0.0, "num_tokens": 99238259.0, "reward": 0.77734375, "reward_std": 0.13835011422634125, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 624 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.547766430613214e-09, "advantages/std": 0.5483170747756958, "advantages/var": 0.300651614490576, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.6702241195304164, "grad_norm": 0.28244036585218013, "learning_rate": 4.5128246553048127e-07, "loss": -0.0, "num_tokens": 99411206.0, "reward": 0.61328125, "reward_std": 0.1629495769739151, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 625 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.3472868915768388e-09, "advantages/std": 0.4959569275379181, "advantages/var": 0.24597327397285174, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.6744930629669157, "grad_norm": 0.2155500491493238, "learning_rate": 4.499521541879508e-07, "loss": 0.0, "num_tokens": 99567544.0, "reward": 0.68359375, "reward_std": 0.11875930428504944, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 626 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.755631042461607e-09, "advantages/std": 0.49596062302589417, "advantages/var": 0.2459769395922331, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 2.678762006403415, "grad_norm": 0.23411680945362368, "learning_rate": 4.486222006197219e-07, "loss": 0.0, "num_tokens": 99728810.0, "reward": 0.6875, "reward_std": 0.12217670679092407, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 627 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.117555573805927e-09, "advantages/std": 0.522786021232605, "advantages/var": 0.2733052239962177, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.6830309498399147, "grad_norm": 0.30473616588248703, "learning_rate": 4.472926143331611e-07, "loss": -0.0, "num_tokens": 99893410.0, "reward": 0.62109375, "reward_std": 0.1332252472639084, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 628 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.972431973776251e-09, "advantages/std": 0.5483101010322571, "advantages/var": 0.30064396689400397, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.687299893276414, "grad_norm": 0.2796635033475051, "learning_rate": 4.459634048330088e-07, "loss": 0.0, "num_tokens": 100042004.0, "reward": 0.7421875, "reward_std": 0.15558435022830963, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 629 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.323229080974593e-10, "advantages/std": 0.43738609552383423, "advantages/var": 0.19130659655758464, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 2.6915688367129134, "grad_norm": 0.25731141790066964, "learning_rate": 4.4463458162131293e-07, "loss": 0.0, "num_tokens": 100195781.0, "reward": 0.6875, "reward_std": 0.08588206768035889, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 630 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.8778354937898005e-09, "advantages/std": 0.49595534801483154, "advantages/var": 0.24597170722451267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.695837780149413, "grad_norm": 0.23915462953716904, "learning_rate": 4.43306154197359e-07, "loss": -0.0, "num_tokens": 100357115.0, "reward": 0.64453125, "reward_std": 0.11652141809463501, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 631 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.8748187096312636e-09, "advantages/std": 0.4049483835697174, "advantages/var": 0.16398319335572697, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 2.7001067235859124, "grad_norm": 0.22316868876816248, "learning_rate": 4.4197813205760363e-07, "loss": -0.0, "num_tokens": 100518712.0, "reward": 0.66796875, "reward_std": 0.07995839416980743, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 632 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907147278172826e-10, "advantages/std": 0.5227950811386108, "advantages/var": 0.2733146968627267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.704375667022412, "grad_norm": 0.2671767594650501, "learning_rate": 4.4065052469560634e-07, "loss": -0.0, "num_tokens": 100663102.0, "reward": 0.765625, "reward_std": 0.1429445743560791, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 633 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.8895334850316917e-09, "advantages/std": 0.36966368556022644, "advantages/var": 0.13665124042196997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 2.7086446104589115, "grad_norm": 0.20871265591814175, "learning_rate": 4.3932334160196105e-07, "loss": -0.0, "num_tokens": 100826005.0, "reward": 0.59765625, "reward_std": 0.06549245119094849, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4913311004638672, "step": 634 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.726209577886217e-09, "advantages/std": 0.43739205598831177, "advantages/var": 0.19131181064168246, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.712913553895411, "grad_norm": 0.2626117494758664, "learning_rate": 4.3799659226422934e-07, "loss": -0.0, "num_tokens": 100961517.0, "reward": 0.80078125, "reward_std": 0.09153735637664795, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 635 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.7249169957935134e-09, "advantages/std": 0.4049423336982727, "advantages/var": 0.16397829362100325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.71718249733191, "grad_norm": 0.16295120586308373, "learning_rate": 4.3667028616687156e-07, "loss": 0.0, "num_tokens": 101131306.0, "reward": 0.64453125, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 636 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.7555197873748836e-09, "advantages/std": 0.4959753155708313, "advantages/var": 0.2459915136555857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 2.72145144076841, "grad_norm": 0.22277699164789738, "learning_rate": 4.3534443279117966e-07, "loss": -0.0, "num_tokens": 101306566.0, "reward": 0.5625, "reward_std": 0.13584628701210022, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49705013632774353, "step": 637 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726798176765442, "advantages/var": 0.3279621735740399, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.725720384204909, "grad_norm": 0.27680391145365696, "learning_rate": 4.3401904161520943e-07, "loss": 0.0, "num_tokens": 101465332.0, "reward": 0.71875, "reward_std": 0.15585274994373322, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 638 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.6590501926319976e-09, "advantages/std": 0.5726829767227173, "advantages/var": 0.32796579182799235, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.729989327641409, "grad_norm": 0.295537748982643, "learning_rate": 4.3269412211371207e-07, "loss": 0.0, "num_tokens": 101617777.0, "reward": 0.63671875, "reward_std": 0.15927013754844666, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 639 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.47316264651047e-09, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.734258271077908, "grad_norm": 0.2862836629920742, "learning_rate": 4.3136968375806764e-07, "loss": 0.0, "num_tokens": 101763561.0, "reward": 0.78515625, "reward_std": 0.10376539826393127, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 640 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.2523525863272304e-09, "advantages/std": 0.2863534986972809, "advantages/var": 0.08199832621617364, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.7385272145144075, "grad_norm": 0.14058634688523225, "learning_rate": 4.3004573601621576e-07, "loss": -0.0, "num_tokens": 101889882.0, "reward": 0.8046875, "reward_std": 0.0468127615749836, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 641 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.49596306681632996, "advantages/var": 0.24597936364585937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.7427961579509073, "grad_norm": 0.2910648666966365, "learning_rate": 4.287222883525896e-07, "loss": 0.0, "num_tokens": 102029621.0, "reward": 0.8046875, "reward_std": 0.12388662248849869, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 642 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.472025841892847e-09, "advantages/std": 0.5727017521858215, "advantages/var": 0.32798729695671014, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.7470651013874066, "grad_norm": 0.31586403545400615, "learning_rate": 4.2739935022804753e-07, "loss": 0.0, "num_tokens": 102187460.0, "reward": 0.71875, "reward_std": 0.18253791332244873, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 643 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.8121162940244e-09, "advantages/std": 0.596075713634491, "advantages/var": 0.3553062563848677, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.7513340448239063, "grad_norm": 0.297871265469956, "learning_rate": 4.260769310998043e-07, "loss": 0.0, "num_tokens": 102336263.0, "reward": 0.75, "reward_std": 0.1822783350944519, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 644 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.5208473665728075e-09, "advantages/std": 0.33064574003219604, "advantages/var": 0.10932660540143857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.7556029882604056, "grad_norm": 0.15027070320158128, "learning_rate": 4.247550404213661e-07, "loss": -0.0, "num_tokens": 102491662.0, "reward": 0.76171875, "reward_std": 0.05786130577325821, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 645 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562930811330029e-09, "advantages/std": 0.5227845311164856, "advantages/var": 0.2733036659746837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.759871931696905, "grad_norm": 0.27059176426281184, "learning_rate": 4.2343368764245994e-07, "loss": -0.0, "num_tokens": 102637046.0, "reward": 0.81640625, "reward_std": 0.13098736107349396, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 646 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344371843712344e-09, "advantages/std": 0.5227869153022766, "advantages/var": 0.27330615881126974, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.7641408751334042, "grad_norm": 0.25141226205888784, "learning_rate": 4.221128822089687e-07, "loss": 0.0, "num_tokens": 102788775.0, "reward": 0.86328125, "reward_std": 0.1344023048877716, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.34422317147254944, "step": 647 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.258416134281467e-09, "advantages/std": 0.43740326166152954, "advantages/var": 0.19132161331214448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 2.768409818569904, "grad_norm": 0.20481933865285598, "learning_rate": 4.207926335628617e-07, "loss": -0.0, "num_tokens": 102947269.0, "reward": 0.71484375, "reward_std": 0.10178709030151367, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 648 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0111997838878266e-09, "advantages/std": 0.6185724139213562, "advantages/var": 0.3826318312644936, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.7726787620064033, "grad_norm": 0.31884596247892505, "learning_rate": 4.1947295114212847e-07, "loss": 0.0, "num_tokens": 103114323.0, "reward": 0.71875, "reward_std": 0.1910865604877472, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 649 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.5049693187805794e-09, "advantages/std": 0.5726837515830994, "advantages/var": 0.32796667932729306, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.776947705442903, "grad_norm": 0.2868042474367118, "learning_rate": 4.1815384438071086e-07, "loss": 0.0, "num_tokens": 103281877.0, "reward": 0.66796875, "reward_std": 0.16044721007347107, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 650 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983533706996105e-09, "advantages/std": 0.46758612990379333, "advantages/var": 0.2186367888784071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.7812166488794023, "grad_norm": 0.2693976494278633, "learning_rate": 4.1683532270843495e-07, "loss": -0.0, "num_tokens": 103432427.0, "reward": 0.7109375, "reward_std": 0.09969891607761383, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 651 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439349784680999e-09, "advantages/std": 0.5726869702339172, "advantages/var": 0.3279703658757036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.7854855923159016, "grad_norm": 0.3233577993325011, "learning_rate": 4.155173955509449e-07, "loss": -0.0, "num_tokens": 103589790.0, "reward": 0.6328125, "reward_std": 0.1638646125793457, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 652 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0646176705942432e-09, "advantages/std": 0.43739765882492065, "advantages/var": 0.1913167119455217, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.7897545357524014, "grad_norm": 0.1948590790756499, "learning_rate": 4.1420007232963435e-07, "loss": 0.0, "num_tokens": 103738326.0, "reward": 0.80078125, "reward_std": 0.09666222333908081, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 653 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.4854458187541657e-09, "advantages/std": 0.46760573983192444, "advantages/var": 0.2186551279237614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.7940234791889007, "grad_norm": 0.22912033472464882, "learning_rate": 4.1288336246157996e-07, "loss": 0.0, "num_tokens": 103899619.0, "reward": 0.73046875, "reward_std": 0.11849091947078705, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 654 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.9794387150541465e-09, "advantages/std": 0.46758410334587097, "advantages/var": 0.21863489370176215, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.7982924226254005, "grad_norm": 0.32781694472570583, "learning_rate": 4.1156727535947383e-07, "loss": -0.0, "num_tokens": 104021397.0, "reward": 0.84765625, "reward_std": 0.0969306081533432, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.3600577116012573, "step": 655 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.63477036056223e-09, "advantages/std": 0.6185793280601501, "advantages/var": 0.38264038510334686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.8025613660618998, "grad_norm": 0.39977204866692856, "learning_rate": 4.1025182043155545e-07, "loss": -0.0, "num_tokens": 104164304.0, "reward": 0.76171875, "reward_std": 0.1996288150548935, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 656 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.389194959261648e-10, "advantages/std": 0.4959544241428375, "advantages/var": 0.24597079082685358, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 2.806830309498399, "grad_norm": 0.2942172566228977, "learning_rate": 4.089370070815462e-07, "loss": -0.0, "num_tokens": 104312308.0, "reward": 0.73828125, "reward_std": 0.11534436047077179, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 657 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 1.383982320429446e-08, "advantages/std": 0.43740418553352356, "advantages/var": 0.1913224215222451, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.8110992529348984, "grad_norm": 0.19964071208461728, "learning_rate": 4.0762284470857995e-07, "loss": 0.0, "num_tokens": 104457041.0, "reward": 0.75390625, "reward_std": 0.10125912725925446, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 658 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.9060479917862134e-10, "advantages/std": 0.5960772633552551, "advantages/var": 0.3553081038890902, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.815368196371398, "grad_norm": 0.32815408933650064, "learning_rate": 4.0630934270713755e-07, "loss": 0.0, "num_tokens": 104613246.0, "reward": 0.7109375, "reward_std": 0.18463245034217834, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 659 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.7342417965515424e-09, "advantages/std": 0.5960754752159119, "advantages/var": 0.35530597215387516, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.8196371398078974, "grad_norm": 0.2767264996596556, "learning_rate": 4.0499651046697946e-07, "loss": -0.0, "num_tokens": 104773937.0, "reward": 0.71484375, "reward_std": 0.1817479282617569, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 660 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.347298598764249e-09, "advantages/std": 0.4959544539451599, "advantages/var": 0.24597082038804174, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 2.823906083244397, "grad_norm": 0.2577226845041563, "learning_rate": 4.036843573730773e-07, "loss": 0.0, "num_tokens": 104928752.0, "reward": 0.71484375, "reward_std": 0.11534436047077179, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 661 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.680367313377009e-09, "advantages/std": 0.5227945446968079, "advantages/var": 0.27331413596474263, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.8281750266808965, "grad_norm": 0.29370182040314274, "learning_rate": 4.0237289280554853e-07, "loss": 0.0, "num_tokens": 105074789.0, "reward": 0.77734375, "reward_std": 0.1422979235649109, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 662 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.489668896152697e-09, "advantages/std": 0.46759358048439026, "advantages/var": 0.21864375651021195, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.832443970117396, "grad_norm": 0.19437045676291703, "learning_rate": 4.0106212613958796e-07, "loss": -0.0, "num_tokens": 105229364.0, "reward": 0.66015625, "reward_std": 0.1060032919049263, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 663 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.6945125683615966e-09, "advantages/std": 0.4959633946418762, "advantages/var": 0.24597968882469345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.8367129135538955, "grad_norm": 0.2647116216149255, "learning_rate": 3.9975206674540196e-07, "loss": -0.0, "num_tokens": 105382844.0, "reward": 0.75390625, "reward_std": 0.12441704422235489, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 664 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6985691161132508e-09, "advantages/std": 0.5482982993125916, "advantages/var": 0.30063102502908023, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.840981856990395, "grad_norm": 0.3388296310037995, "learning_rate": 3.9844272398814026e-07, "loss": 0.0, "num_tokens": 105524271.0, "reward": 0.828125, "reward_std": 0.14085884392261505, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 665 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8182126795653633e-09, "advantages/std": 0.64027339220047, "advantages/var": 0.40995001675989684, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.8452508004268946, "grad_norm": 0.36566203134323744, "learning_rate": 3.9713410722783014e-07, "loss": -0.0, "num_tokens": 105677187.0, "reward": 0.6796875, "reward_std": 0.19135494530200958, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 666 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1232026243611865e-09, "advantages/std": 0.548300564289093, "advantages/var": 0.3006335087997378, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.849519743863394, "grad_norm": 0.3015780894876757, "learning_rate": 3.958262258193088e-07, "loss": 0.0, "num_tokens": 105842642.0, "reward": 0.6875, "reward_std": 0.1442737877368927, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 667 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8819989581979145e-09, "advantages/std": 0.6185727119445801, "advantages/var": 0.38263219996247244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.853788687299893, "grad_norm": 0.4140352972264349, "learning_rate": 3.9451908911215637e-07, "loss": -0.0, "num_tokens": 106019311.0, "reward": 0.61328125, "reward_std": 0.191616952419281, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 668 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.3877060235654596e-09, "advantages/std": 0.43739765882492065, "advantages/var": 0.1913167119455217, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.8580576307363925, "grad_norm": 0.31121658514582756, "learning_rate": 3.932127064506299e-07, "loss": 0.0, "num_tokens": 106166230.0, "reward": 0.76953125, "reward_std": 0.09666221588850021, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 669 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.0779075803624075e-09, "advantages/std": 0.5960719585418701, "advantages/var": 0.35530177975994093, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.8623265741728923, "grad_norm": 0.23713356471530259, "learning_rate": 3.919070871735955e-07, "loss": -0.0, "num_tokens": 106337094.0, "reward": 0.69921875, "reward_std": 0.17938891053199768, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 670 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.547875832215062e-09, "advantages/std": 0.5482935309410095, "advantages/var": 0.30062579607175977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.8665955176093916, "grad_norm": 0.2832364826132542, "learning_rate": 3.906022406144624e-07, "loss": 0.0, "num_tokens": 106479557.0, "reward": 0.76171875, "reward_std": 0.13520357012748718, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 671 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0646322509902576e-09, "advantages/std": 0.4373916685581207, "advantages/var": 0.19131147172405694, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.8708644610458913, "grad_norm": 0.19208483130815907, "learning_rate": 3.8929817610111633e-07, "loss": 0.0, "num_tokens": 106619046.0, "reward": 0.75, "reward_std": 0.09100693464279175, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 672 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.979314307724204e-09, "advantages/std": 0.46759578585624695, "advantages/var": 0.21864581895052115, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.8751334044823906, "grad_norm": 0.23726851480513966, "learning_rate": 3.879949029558515e-07, "loss": -0.0, "num_tokens": 106764635.0, "reward": 0.78125, "reward_std": 0.10718279331922531, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 673 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.3875618921514382e-09, "advantages/std": 0.6185793280601501, "advantages/var": 0.38264038510334686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.87940234791889, "grad_norm": 0.30002245157561425, "learning_rate": 3.866924304953058e-07, "loss": -0.0, "num_tokens": 106923263.0, "reward": 0.68359375, "reward_std": 0.1996288150548935, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 674 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.4157574238941685e-09, "advantages/std": 0.6816369295120239, "advantages/var": 0.4646289036745799, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.8836712913553897, "grad_norm": 0.36167138258359255, "learning_rate": 3.853907680303928e-07, "loss": 0.0, "num_tokens": 107079263.0, "reward": 0.77734375, "reward_std": 0.23619185388088226, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 675 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.694508619074992e-10, "advantages/std": 0.49596381187438965, "advantages/var": 0.24598010268897497, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.887940234791889, "grad_norm": 0.28628604820320996, "learning_rate": 3.840899248662358e-07, "loss": -0.0, "num_tokens": 107221210.0, "reward": 0.7578125, "reward_std": 0.12665247917175293, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 676 }, { "advantages/mean": -5.3551048040390015e-09, "advantages/snr": 1.024351834685619e-08, "advantages/std": 0.5227798223495483, "advantages/var": 0.2732987426558253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.8922091782283887, "grad_norm": 0.26532878266531107, "learning_rate": 3.827899103021016e-07, "loss": 0.0, "num_tokens": 107375048.0, "reward": 0.86328125, "reward_std": 0.1258624941110611, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.34422317147254944, "step": 677 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.258498537262613e-09, "advantages/std": 0.43739479780197144, "advantages/var": 0.19131420914422748, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.896478121664888, "grad_norm": 0.20238707133078684, "learning_rate": 3.814907336313329e-07, "loss": 0.0, "num_tokens": 107512425.0, "reward": 0.80859375, "reward_std": 0.09324727952480316, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 678 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235152379307402e-09, "advantages/std": 0.522782564163208, "advantages/var": 0.2733016093930587, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.9007470651013874, "grad_norm": 0.26416420308482647, "learning_rate": 3.801924041412833e-07, "loss": 0.0, "num_tokens": 107662320.0, "reward": 0.765625, "reward_std": 0.12980784475803375, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 679 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.138353280650014e-08, "advantages/std": 0.5726919770240784, "advantages/var": 0.3279761005477475, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.9050160085378867, "grad_norm": 0.2978908074569031, "learning_rate": 3.788949311132497e-07, "loss": 0.0, "num_tokens": 107823472.0, "reward": 0.71875, "reward_std": 0.16846150159835815, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 680 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.493814952589826e-09, "advantages/std": 0.4675893187522888, "advantages/var": 0.21863977101122956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.9092849519743864, "grad_norm": 0.2536239393904906, "learning_rate": 3.77598323822407e-07, "loss": 0.0, "num_tokens": 107976960.0, "reward": 0.69921875, "reward_std": 0.10205547511577606, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 681 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.855440276329792e-09, "advantages/std": 0.4373944401741028, "advantages/var": 0.19131389629521678, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 2.9135538954108857, "grad_norm": 0.27394085605791346, "learning_rate": 3.763025915377402e-07, "loss": 0.0, "num_tokens": 108121599.0, "reward": 0.75, "reward_std": 0.09271685779094696, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 682 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 9.389061808844325e-09, "advantages/std": 0.495961457490921, "advantages/var": 0.24597776731651866, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.9178228388473855, "grad_norm": 0.21000631620580057, "learning_rate": 3.750077435219806e-07, "loss": 0.0, "num_tokens": 108270769.0, "reward": 0.7578125, "reward_std": 0.12164874374866486, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 683 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.246344777610733e-10, "advantages/std": 0.5483083724975586, "advantages/var": 0.30064207135092147, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 2.9220917822838848, "grad_norm": 0.3039058762004899, "learning_rate": 3.7371378903153743e-07, "loss": 0.0, "num_tokens": 108419518.0, "reward": 0.75390625, "reward_std": 0.1528160572052002, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 684 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.129257973375593e-09, "advantages/std": 0.4373930096626282, "advantages/var": 0.19131264490173194, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.926360725720384, "grad_norm": 0.199255740500126, "learning_rate": 3.724207373164321e-07, "loss": 0.0, "num_tokens": 108573111.0, "reward": 0.67578125, "reward_std": 0.09100939333438873, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 685 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5968822224932214e-09, "advantages/std": 0.43740978837013245, "advantages/var": 0.19132732296200405, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.930629669156884, "grad_norm": 0.20397965382033725, "learning_rate": 3.7112859762023305e-07, "loss": 0.0, "num_tokens": 108718246.0, "reward": 0.69140625, "reward_std": 0.10638399422168732, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 686 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.485497578361386e-09, "advantages/std": 0.4675987958908081, "advantages/var": 0.21864863391853362, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.934898612593383, "grad_norm": 0.27493274840434406, "learning_rate": 3.698373791799885e-07, "loss": -0.0, "num_tokens": 108886492.0, "reward": 0.65234375, "reward_std": 0.11112816631793976, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 687 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6985266479578181e-09, "advantages/std": 0.5483120083808899, "advantages/var": 0.30064605853468507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.9391675560298824, "grad_norm": 0.297621247232595, "learning_rate": 3.6854709122616143e-07, "loss": 0.0, "num_tokens": 109066092.0, "reward": 0.66015625, "reward_std": 0.15676385164260864, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 688 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721981084975215e-09, "advantages/std": 0.5227845311164856, "advantages/var": 0.2733036659746837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.943436499466382, "grad_norm": 0.2978856950784898, "learning_rate": 3.6725774298256286e-07, "loss": 0.0, "num_tokens": 109232524.0, "reward": 0.75390625, "reward_std": 0.13098736107349396, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 689 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516804898616483e-09, "advantages/std": 0.6185716986656189, "advantages/var": 0.3826309463900692, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.9477054429028815, "grad_norm": 0.3153950668234978, "learning_rate": 3.6596934366628584e-07, "loss": 0.0, "num_tokens": 109377958.0, "reward": 0.6484375, "reward_std": 0.18990948796272278, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 690 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907156417836012e-10, "advantages/std": 0.5227945446968079, "advantages/var": 0.27331413596474263, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.951974386339381, "grad_norm": 0.23800738969758298, "learning_rate": 3.646819024876406e-07, "loss": 0.0, "num_tokens": 109519667.0, "reward": 0.67578125, "reward_std": 0.1422979235649109, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 691 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.874775760889987e-09, "advantages/std": 0.4049544334411621, "advantages/var": 0.1639880931636526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.9562433297758806, "grad_norm": 0.21111161810104676, "learning_rate": 3.633954286500872e-07, "loss": -0.0, "num_tokens": 109663347.0, "reward": 0.76171875, "reward_std": 0.0850832611322403, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 692 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4393556240477054e-09, "advantages/std": 0.5726855993270874, "advantages/var": 0.3279687956766253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.96051227321238, "grad_norm": 0.2781378250410254, "learning_rate": 3.621099313501711e-07, "loss": 0.0, "num_tokens": 109815831.0, "reward": 0.671875, "reward_std": 0.16162671148777008, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 693 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.0646405206460833e-09, "advantages/std": 0.43738827109336853, "advantages/var": 0.19130849969004604, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.9647812166488796, "grad_norm": 0.21914340104481092, "learning_rate": 3.608254197774567e-07, "loss": 0.0, "num_tokens": 109967825.0, "reward": 0.66015625, "reward_std": 0.08865037560462952, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 694 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.3230332244416e-09, "advantages/std": 0.4374021887779236, "advantages/var": 0.1913206747477183, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.969050160085379, "grad_norm": 0.22185614338112256, "learning_rate": 3.5954190311446144e-07, "loss": 0.0, "num_tokens": 110116126.0, "reward": 0.73828125, "reward_std": 0.10061003267765045, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 695 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 9.049287811310864e-09, "advantages/std": 0.437395840883255, "advantages/var": 0.19131512162196973, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.9733191035218782, "grad_norm": 0.225793553689538, "learning_rate": 3.582593905365912e-07, "loss": 0.0, "num_tokens": 110265773.0, "reward": 0.67578125, "reward_std": 0.09442433714866638, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 696 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.519352591236486e-09, "advantages/std": 0.3696674108505249, "advantages/var": 0.13665399464493078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.9775880469583775, "grad_norm": 0.20542793794206402, "learning_rate": 3.5697789121207295e-07, "loss": -0.0, "num_tokens": 110394807.0, "reward": 0.8671875, "reward_std": 0.0677327960729599, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3400367796421051, "step": 697 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131143099359766e-10, "advantages/std": 0.5726885795593262, "advantages/var": 0.32797220915767866, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.9818569903948773, "grad_norm": 0.28886301623000166, "learning_rate": 3.556974143018916e-07, "loss": -0.0, "num_tokens": 110549186.0, "reward": 0.69921875, "reward_std": 0.16663289070129395, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 698 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.694515671377163e-10, "advantages/std": 0.49596306681632996, "advantages/var": 0.24597936364585937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 2.9861259338313766, "grad_norm": 0.24207518599909547, "learning_rate": 3.54417968959722e-07, "loss": 0.0, "num_tokens": 110707900.0, "reward": 0.6328125, "reward_std": 0.12388662248849869, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 699 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.129219287094493e-09, "advantages/std": 0.6185618042945862, "advantages/var": 0.38261870573217394, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.9903948772678763, "grad_norm": 0.28677140760275627, "learning_rate": 3.531395643318653e-07, "loss": 0.0, "num_tokens": 110869282.0, "reward": 0.6953125, "reward_std": 0.17794983088970184, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 700 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.75558162001709e-09, "advantages/std": 0.49596714973449707, "advantages/var": 0.24598341361576104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.9946638207043756, "grad_norm": 0.25190631294689303, "learning_rate": 3.5186220955718303e-07, "loss": -0.0, "num_tokens": 111025469.0, "reward": 0.6953125, "reward_std": 0.12953945994377136, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 701 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175257168546057e-09, "advantages/std": 0.5227910280227661, "advantages/var": 0.2733104589811006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.998932764140875, "grad_norm": 0.29505803290443655, "learning_rate": 3.505859137670313e-07, "loss": 0.0, "num_tokens": 111186315.0, "reward": 0.5546875, "reward_std": 0.13888052105903625, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49797385931015015, "step": 702 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.258565854740685e-09, "advantages/std": 0.4373878836631775, "advantages/var": 0.1913081607753533, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.0042689434364993, "grad_norm": 0.24085320145014932, "learning_rate": 3.493106860851962e-07, "loss": 0.0, "num_tokens": 111307504.0, "reward": 0.8046875, "reward_std": 0.08811995387077332, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 703 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721548464238397e-09, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 3.008537886872999, "grad_norm": 0.2488154060969329, "learning_rate": 3.4803653562782804e-07, "loss": -0.0, "num_tokens": 111482548.0, "reward": 0.62890625, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 704 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.9724216339261817e-09, "advantages/std": 0.5483120083808899, "advantages/var": 0.30064605853468507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.0128068303094984, "grad_norm": 0.2594574520332894, "learning_rate": 3.467634715033767e-07, "loss": 0.0, "num_tokens": 111628416.0, "reward": 0.76171875, "reward_std": 0.15676386654376984, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 705 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.3971448795952543e-09, "advantages/std": 0.5482972264289856, "advantages/var": 0.3006298485097183, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 3.0170757737459977, "grad_norm": 0.25315906824862405, "learning_rate": 3.454915028125263e-07, "loss": -0.0, "num_tokens": 111792422.0, "reward": 0.71484375, "reward_std": 0.14085638523101807, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 706 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.749631495215814e-10, "advantages/std": 0.40494880080223083, "advantages/var": 0.16398353127116483, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.0213447171824974, "grad_norm": 0.21177046794302226, "learning_rate": 3.442206386481297e-07, "loss": -0.0, "num_tokens": 111941877.0, "reward": 0.703125, "reward_std": 0.08048881590366364, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 707 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344384639658041e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.0256136606189967, "grad_norm": 0.3119065153498931, "learning_rate": 3.429508880951444e-07, "loss": -0.0, "num_tokens": 112094054.0, "reward": 0.7421875, "reward_std": 0.13269482553005219, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 708 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0646084583111024e-09, "advantages/std": 0.4374014437198639, "advantages/var": 0.19132002296822126, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.0298826040554965, "grad_norm": 0.23739593038026452, "learning_rate": 3.4168226023056636e-07, "loss": 0.0, "num_tokens": 112239948.0, "reward": 0.81640625, "reward_std": 0.09954920411109924, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 709 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 7.468917193812455e-09, "advantages/std": 0.46759918332099915, "advantages/var": 0.21864899624246537, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.034151547491996, "grad_norm": 0.24161547263417743, "learning_rate": 3.404147641233667e-07, "loss": -0.0, "num_tokens": 112390687.0, "reward": 0.75, "reward_std": 0.11165857315063477, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 710 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.5628885646149507e-09, "advantages/std": 0.5227907299995422, "advantages/var": 0.27331014737345427, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.038420490928495, "grad_norm": 0.3094102816726311, "learning_rate": 3.391484088344256e-07, "loss": -0.0, "num_tokens": 112550104.0, "reward": 0.71484375, "reward_std": 0.13835011422634125, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 711 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.2727170154969998e-09, "advantages/std": 0.6402862668037415, "advantages/var": 0.409966503457472, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.042689434364995, "grad_norm": 0.3603208332066757, "learning_rate": 3.378832034164676e-07, "loss": -0.0, "num_tokens": 112710103.0, "reward": 0.63671875, "reward_std": 0.20779281854629517, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 712 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 7.041808204615512e-09, "advantages/std": 0.49596062302589417, "advantages/var": 0.2459769395922331, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 3.046958377801494, "grad_norm": 0.24706872570747945, "learning_rate": 3.366191569139981e-07, "loss": -0.0, "num_tokens": 112867011.0, "reward": 0.796875, "reward_std": 0.12217669934034348, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 713 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4938024801910196e-09, "advantages/std": 0.4675932228565216, "advantages/var": 0.21864342206134868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.0512273212379935, "grad_norm": 0.21995529779889317, "learning_rate": 3.3535627836323674e-07, "loss": -0.0, "num_tokens": 113021282.0, "reward": 0.75, "reward_std": 0.1054728776216507, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 714 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.906124938468067e-09, "advantages/std": 0.5960655212402344, "advantages/var": 0.3552941056113923, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.055496264674493, "grad_norm": 0.34320023408995987, "learning_rate": 3.3409457679205466e-07, "loss": -0.0, "num_tokens": 113172698.0, "reward": 0.671875, "reward_std": 0.17031869292259216, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 715 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196787006259086e-09, "advantages/std": 0.572685182094574, "advantages/var": 0.32796831779069535, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.0597652081109925, "grad_norm": 0.2931749535923372, "learning_rate": 3.328340612199091e-07, "loss": -0.0, "num_tokens": 113331565.0, "reward": 0.70703125, "reward_std": 0.1626850962638855, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 716 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.2775343281201946e-08, "advantages/std": 0.4374000132083893, "advantages/var": 0.19131877155469912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.064034151547492, "grad_norm": 0.19166950434503208, "learning_rate": 3.3157474065777867e-07, "loss": -0.0, "num_tokens": 113483236.0, "reward": 0.7890625, "reward_std": 0.09784172475337982, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 717 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599778655449369e-09, "advantages/std": 0.4049423336982727, "advantages/var": 0.16397829362100325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.0683030949839916, "grad_norm": 0.22559544144317178, "learning_rate": 3.3031662410809955e-07, "loss": 0.0, "num_tokens": 113628029.0, "reward": 0.72265625, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 718 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.4496892771862277e-09, "advantages/std": 0.4049593210220337, "advantages/var": 0.16399205168262654, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.072572038420491, "grad_norm": 0.20541126227691692, "learning_rate": 3.290597205647008e-07, "loss": 0.0, "num_tokens": 113768903.0, "reward": 0.73828125, "reward_std": 0.08903107047080994, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 719 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.899019811574457e-09, "advantages/std": 0.5227856040000916, "advantages/var": 0.27330478774974054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 3.07684098185699, "grad_norm": 0.22644770218390384, "learning_rate": 3.278040390127402e-07, "loss": -0.0, "num_tokens": 113930172.0, "reward": 0.71875, "reward_std": 0.13098980486392975, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 720 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.492689555221465e-09, "advantages/std": 0.5483083724975586, "advantages/var": 0.30064207135092147, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.08110992529349, "grad_norm": 0.3173400630165274, "learning_rate": 3.2654958842863966e-07, "loss": 0.0, "num_tokens": 114077777.0, "reward": 0.73828125, "reward_std": 0.1528160572052002, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 721 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954297384457748e-08, "advantages/std": 0.4676040709018707, "advantages/var": 0.21865356712400175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.0853788687299892, "grad_norm": 0.3356049531649926, "learning_rate": 3.252963777800217e-07, "loss": 0.0, "num_tokens": 114224987.0, "reward": 0.71484375, "reward_std": 0.11625301837921143, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 722 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.855416338439584e-09, "advantages/std": 0.43739622831344604, "advantages/var": 0.19131546054282822, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.089647812166489, "grad_norm": 0.2177588528741852, "learning_rate": 3.2404441602564505e-07, "loss": 0.0, "num_tokens": 114374152.0, "reward": 0.6796875, "reward_std": 0.09495474398136139, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 723 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.749467742471436e-10, "advantages/std": 0.4049603343009949, "advantages/var": 0.16399287235717352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.0939167556029883, "grad_norm": 0.20650317992058692, "learning_rate": 3.2279371211533975e-07, "loss": -0.0, "num_tokens": 114523324.0, "reward": 0.80859375, "reward_std": 0.08850310742855072, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 724 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.1492366929915195e-09, "advantages/std": 0.36966201663017273, "advantages/var": 0.1366500065390861, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.0981856990394876, "grad_norm": 0.1744781666559507, "learning_rate": 3.2154427498994514e-07, "loss": 0.0, "num_tokens": 114658106.0, "reward": 0.8359375, "reward_std": 0.06378498673439026, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.3710577189922333, "step": 725 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.21884497705121e-09, "advantages/std": 0.5483039021492004, "advantages/var": 0.30063716911203997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.1024546424759873, "grad_norm": 0.30287765048437704, "learning_rate": 3.2029611358124365e-07, "loss": -0.0, "num_tokens": 114802608.0, "reward": 0.69921875, "reward_std": 0.14769119024276733, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 726 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.8990304241357955e-09, "advantages/std": 0.5227844715118408, "advantages/var": 0.2733036036539147, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.1067235859124867, "grad_norm": 0.24978831635545135, "learning_rate": 3.190492368118988e-07, "loss": 0.0, "num_tokens": 114947279.0, "reward": 0.80078125, "reward_std": 0.13098734617233276, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 727 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.2186833239762965e-09, "advantages/std": 0.5483161807060242, "advantages/var": 0.30065063402404135, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 3.110992529348986, "grad_norm": 0.25997521211411223, "learning_rate": 3.1780365359539043e-07, "loss": 0.0, "num_tokens": 115098361.0, "reward": 0.71875, "reward_std": 0.1613583266735077, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 728 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 7.452348067104926e-09, "advantages/std": 0.43739622831344604, "advantages/var": 0.19131546054282822, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.1152614727854857, "grad_norm": 0.27132323916482515, "learning_rate": 3.1655937283595113e-07, "loss": 0.0, "num_tokens": 115227478.0, "reward": 0.8828125, "reward_std": 0.09495474398136139, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.3222736418247223, "step": 729 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.878690429522677e-09, "advantages/std": 0.5726880431175232, "advantages/var": 0.3279715947297781, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.119530416221985, "grad_norm": 0.3134701671286318, "learning_rate": 3.153164034285031e-07, "loss": -0.0, "num_tokens": 115370940.0, "reward": 0.76171875, "reward_std": 0.16557206213474274, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 730 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.1938662863450107e-09, "advantages/std": 0.437395840883255, "advantages/var": 0.19131512162196973, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.1237993596584843, "grad_norm": 0.23771120017791628, "learning_rate": 3.1407475425859343e-07, "loss": 0.0, "num_tokens": 115526803.0, "reward": 0.76171875, "reward_std": 0.09442433714866638, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 731 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.2596721319098194e-09, "advantages/std": 0.3696686327457428, "advantages/var": 0.13665489803610686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.128068303094984, "grad_norm": 0.20839087665717526, "learning_rate": 3.128344342023319e-07, "loss": -0.0, "num_tokens": 115665824.0, "reward": 0.8203125, "reward_std": 0.06890984624624252, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 732 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.9724655787857923e-09, "advantages/std": 0.5483039021492004, "advantages/var": 0.30063716911203997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.1323372465314834, "grad_norm": 0.2824516520486795, "learning_rate": 3.1159545212632695e-07, "loss": 0.0, "num_tokens": 115838105.0, "reward": 0.72265625, "reward_std": 0.14769119024276733, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 733 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.131212494768329e-10, "advantages/std": 0.5726836919784546, "advantages/var": 0.32796661105807345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.136606189967983, "grad_norm": 0.3025831327566476, "learning_rate": 3.1035781688762176e-07, "loss": 0.0, "num_tokens": 115996428.0, "reward": 0.67578125, "reward_std": 0.15874217450618744, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 734 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.528048060890719e-10, "advantages/std": 0.6185684204101562, "advantages/var": 0.3826268907287158, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.1408751334044824, "grad_norm": 0.3102665039315969, "learning_rate": 3.09121537333632e-07, "loss": -0.0, "num_tokens": 116142635.0, "reward": 0.78125, "reward_std": 0.18596167862415314, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 735 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.492925903541175e-10, "advantages/std": 0.5482931137084961, "advantages/var": 0.3006253385401578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.1451440768409817, "grad_norm": 0.26113775100574044, "learning_rate": 3.0788662230208145e-07, "loss": -0.0, "num_tokens": 116289720.0, "reward": 0.75, "reward_std": 0.1362619400024414, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 736 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.749545597910266e-10, "advantages/std": 0.40495485067367554, "advantages/var": 0.16398843108413885, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.1494130202774815, "grad_norm": 0.2541314504440522, "learning_rate": 3.0665308062094017e-07, "loss": 0.0, "num_tokens": 116418280.0, "reward": 0.828125, "reward_std": 0.0856136754155159, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 737 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.8459080821278322e-09, "advantages/std": 0.5726869702339172, "advantages/var": 0.3279703658757036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 3.153681963713981, "grad_norm": 0.2974488892903314, "learning_rate": 3.054209211083599e-07, "loss": -0.0, "num_tokens": 116585887.0, "reward": 0.59375, "reward_std": 0.1638646125793457, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.49209436774253845, "step": 738 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.323004209884659e-10, "advantages/std": 0.4374045729637146, "advantages/var": 0.19132276044956953, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 3.15795090715048, "grad_norm": 0.20160205677745852, "learning_rate": 3.0419015257261195e-07, "loss": 0.0, "num_tokens": 116749373.0, "reward": 0.6484375, "reward_std": 0.10178954154253006, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 739 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5624240399785312e-09, "advantages/std": 0.5960754156112671, "advantages/var": 0.3553059010961448, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 3.16221985058698, "grad_norm": 0.2871646775874437, "learning_rate": 3.029607838120246e-07, "loss": -0.0, "num_tokens": 116912138.0, "reward": 0.65234375, "reward_std": 0.1817479282617569, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 740 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0646167275973057e-09, "advantages/std": 0.4373980462551117, "advantages/var": 0.19131705086778883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.166488794023479, "grad_norm": 0.24069845260442374, "learning_rate": 3.017328236149186e-07, "loss": 0.0, "num_tokens": 117058546.0, "reward": 0.7578125, "reward_std": 0.09719263762235641, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 741 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.562928780215032e-09, "advantages/std": 0.5227848291397095, "advantages/var": 0.2733039775786352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 3.1707577374599785, "grad_norm": 0.24511272617353355, "learning_rate": 3.005062807595464e-07, "loss": -0.0, "num_tokens": 117227763.0, "reward": 0.5625, "reward_std": 0.13151776790618896, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49705013632774353, "step": 742 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4959576427936554, "advantages/var": 0.24597398344543908, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.175026680896478, "grad_norm": 0.28007502980297416, "learning_rate": 2.9928116401402745e-07, "loss": 0.0, "num_tokens": 117382433.0, "reward": 0.69140625, "reward_std": 0.11982014030218124, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 743 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.098285603281836e-09, "advantages/std": 0.5726953148841858, "advantages/var": 0.3279799236902967, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 3.1792956243329775, "grad_norm": 0.22358854618839655, "learning_rate": 2.980574821362872e-07, "loss": -0.0, "num_tokens": 117555197.0, "reward": 0.65625, "reward_std": 0.17399811744689941, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 744 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.3472014185425037e-09, "advantages/std": 0.49597498774528503, "advantages/var": 0.24599118846893564, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.1835645677694773, "grad_norm": 0.2605663955901431, "learning_rate": 2.9683524387399353e-07, "loss": -0.0, "num_tokens": 117697821.0, "reward": 0.79296875, "reward_std": 0.13531586527824402, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 745 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 7.81241861977572e-09, "advantages/std": 0.5960526466369629, "advantages/var": 0.35527875756292815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.1878335112059766, "grad_norm": 0.32213306719526563, "learning_rate": 2.9561445796449414e-07, "loss": 0.0, "num_tokens": 117847690.0, "reward": 0.734375, "reward_std": 0.15388324856758118, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 746 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.511355967409318e-09, "advantages/std": 0.4959544241428375, "advantages/var": 0.24597079082685358, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.192102454642476, "grad_norm": 0.33015216030836414, "learning_rate": 2.943951331347546e-07, "loss": -0.0, "num_tokens": 118006200.0, "reward": 0.76953125, "reward_std": 0.11534436047077179, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 747 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.3443054276452664e-09, "advantages/std": 0.5227934122085571, "advantages/var": 0.27331295184866633, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.1963713980789756, "grad_norm": 0.272846026500217, "learning_rate": 2.931772781012958e-07, "loss": -0.0, "num_tokens": 118145012.0, "reward": 0.65625, "reward_std": 0.1422954797744751, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 748 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983451446179363e-09, "advantages/std": 0.46759578585624695, "advantages/var": 0.21864581895052115, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.200640341515475, "grad_norm": 0.2411495898833993, "learning_rate": 2.9196090157013143e-07, "loss": 0.0, "num_tokens": 118292956.0, "reward": 0.640625, "reward_std": 0.10718280076980591, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 749 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.694635563755988e-10, "advantages/std": 0.4959504008293152, "advantages/var": 0.2459668000827584, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 3.2049092849519742, "grad_norm": 0.282352941450176, "learning_rate": 2.9074601223670613e-07, "loss": 0.0, "num_tokens": 118459099.0, "reward": 0.70703125, "reward_std": 0.11139655113220215, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 750 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.794128748648032e-09, "advantages/std": 0.5483102202415466, "advantages/var": 0.3006440976213334, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.209178228388474, "grad_norm": 0.277899846738354, "learning_rate": 2.895326187858326e-07, "loss": 0.0, "num_tokens": 118591536.0, "reward": 0.7734375, "reward_std": 0.1539955735206604, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 751 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.694570397962185e-10, "advantages/std": 0.49595728516578674, "advantages/var": 0.2459736287090175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.2134471718249733, "grad_norm": 0.2643694557106249, "learning_rate": 2.883207298916304e-07, "loss": 0.0, "num_tokens": 118744790.0, "reward": 0.7421875, "reward_std": 0.11928972601890564, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 752 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.818215037122586e-09, "advantages/std": 0.6402806043624878, "advantages/var": 0.4099592523227926, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.2177161152614726, "grad_norm": 0.3697522220779177, "learning_rate": 2.8711035421746363e-07, "loss": 0.0, "num_tokens": 118901732.0, "reward": 0.74609375, "reward_std": 0.2009580433368683, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 753 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628515995610897e-09, "advantages/std": 0.5227961540222168, "advantages/var": 0.27331581866042143, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.2219850586979724, "grad_norm": 0.26758234939193326, "learning_rate": 2.8590150041587886e-07, "loss": -0.0, "num_tokens": 119045722.0, "reward": 0.67578125, "reward_std": 0.1429470181465149, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 754 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.661586974765028e-09, "advantages/std": 0.43739062547683716, "advantages/var": 0.19131055925501883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.2262540021344717, "grad_norm": 0.28207170515823043, "learning_rate": 2.846941771285428e-07, "loss": 0.0, "num_tokens": 119187816.0, "reward": 0.84375, "reward_std": 0.08982987701892853, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3638034462928772, "step": 755 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 6.9115327786001205e-09, "advantages/std": 0.5726835131645203, "advantages/var": 0.32796640625045725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.2305229455709714, "grad_norm": 0.36454511361568076, "learning_rate": 2.8348839298618177e-07, "loss": 0.0, "num_tokens": 119341187.0, "reward": 0.7734375, "reward_std": 0.15991678833961487, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 756 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.131367227291202e-10, "advantages/std": 0.28633639216423035, "advantages/var": 0.08198852947762791, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.2347918890074707, "grad_norm": 0.14542160724387604, "learning_rate": 2.8228415660851916e-07, "loss": -0.0, "num_tokens": 119482536.0, "reward": 0.8125, "reward_std": 0.036563023924827576, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.3910769522190094, "step": 757 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.389108072544253e-10, "advantages/std": 0.49595901370048523, "advantages/var": 0.2459753432707581, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.23906083244397, "grad_norm": 0.2659098513284101, "learning_rate": 2.810814766042132e-07, "loss": -0.0, "num_tokens": 119648386.0, "reward": 0.7109375, "reward_std": 0.11993882060050964, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 758 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.7556100546892774e-09, "advantages/std": 0.4959633946418762, "advantages/var": 0.24597968882469345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.24332977588047, "grad_norm": 0.22029640670047446, "learning_rate": 2.7988036157079753e-07, "loss": 0.0, "num_tokens": 119801114.0, "reward": 0.80078125, "reward_std": 0.12441704422235489, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 759 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975227058317357e-09, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.247598719316969, "grad_norm": 0.2659706965042973, "learning_rate": 2.78680820094617e-07, "loss": 0.0, "num_tokens": 119936957.0, "reward": 0.79296875, "reward_std": 0.10376541316509247, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 760 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.6590475268052982e-09, "advantages/std": 0.5726833939552307, "advantages/var": 0.327966269712082, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.2518676627534684, "grad_norm": 0.32066245051467945, "learning_rate": 2.774828607507683e-07, "loss": 0.0, "num_tokens": 120083359.0, "reward": 0.71875, "reward_std": 0.15821176767349243, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 761 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.450217068554151e-09, "advantages/std": 0.4959578514099121, "advantages/var": 0.24597419037513646, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.256136606189968, "grad_norm": 0.2510266146709993, "learning_rate": 2.7628649210303836e-07, "loss": 0.0, "num_tokens": 120232789.0, "reward": 0.77734375, "reward_std": 0.11993636190891266, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 762 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.369569097398126e-09, "advantages/std": 0.5483039021492004, "advantages/var": 0.30063716911203997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.2604055496264674, "grad_norm": 0.28712600517511083, "learning_rate": 2.750917227038418e-07, "loss": -0.0, "num_tokens": 120380304.0, "reward": 0.72265625, "reward_std": 0.14769119024276733, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 763 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 6.6803871160595985e-09, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.2646744930629668, "grad_norm": 0.29156871329647277, "learning_rate": 2.7389856109416175e-07, "loss": -0.0, "num_tokens": 120542612.0, "reward": 0.57421875, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 764 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.749631495215814e-10, "advantages/std": 0.40494880080223083, "advantages/var": 0.16398353127116483, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.2689434364994665, "grad_norm": 0.16744214090624027, "learning_rate": 2.7270701580348734e-07, "loss": -0.0, "num_tokens": 120679485.0, "reward": 0.765625, "reward_std": 0.08048880845308304, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 765 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.0646255047872349e-09, "advantages/std": 0.4373944401741028, "advantages/var": 0.19131389629521678, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 3.273212379935966, "grad_norm": 0.2318518047895217, "learning_rate": 2.715170953497532e-07, "loss": 0.0, "num_tokens": 120831937.0, "reward": 0.6796875, "reward_std": 0.09271685779094696, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 766 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.218814372272699e-09, "advantages/std": 0.5483062267303467, "advantages/var": 0.30063971827127034, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.2774813233724656, "grad_norm": 0.328974605528526, "learning_rate": 2.7032880823927906e-07, "loss": 0.0, "num_tokens": 120981368.0, "reward": 0.72265625, "reward_std": 0.1511061191558838, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 767 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878730035740397e-09, "advantages/std": 0.5726833939552307, "advantages/var": 0.327966269712082, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.281750266808965, "grad_norm": 0.42304270998951904, "learning_rate": 2.691421629667076e-07, "loss": 0.0, "num_tokens": 121125897.0, "reward": 0.609375, "reward_std": 0.15821176767349243, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 768 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.041734719091418e-10, "advantages/std": 0.3306438624858856, "advantages/var": 0.10932536379958524, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.286019210245464, "grad_norm": 0.1430120963726623, "learning_rate": 2.6795716801494534e-07, "loss": 0.0, "num_tokens": 121266307.0, "reward": 0.8125, "reward_std": 0.056153833866119385, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.3910769522190094, "step": 769 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.323008924728638e-09, "advantages/std": 0.43740418553352356, "advantages/var": 0.1913224215222451, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 3.2902881536819635, "grad_norm": 0.22800294676904456, "learning_rate": 2.667738318551005e-07, "loss": 0.0, "num_tokens": 121416178.0, "reward": 0.70703125, "reward_std": 0.10125912725925446, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 770 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966747009302438e-09, "advantages/std": 0.46760493516921997, "advantages/var": 0.2186543753946104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.2945570971184632, "grad_norm": 0.25481125832933416, "learning_rate": 2.655921629464245e-07, "loss": 0.0, "num_tokens": 121565746.0, "reward": 0.71484375, "reward_std": 0.1157250627875328, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 771 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.572310092063941e-09, "advantages/std": 0.4959639608860016, "advantages/var": 0.2459802504977313, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.2988260405549625, "grad_norm": 0.28682597279632915, "learning_rate": 2.644121697362485e-07, "loss": 0.0, "num_tokens": 121704270.0, "reward": 0.8046875, "reward_std": 0.1250636875629425, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 772 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6985414192492779e-09, "advantages/std": 0.5483072400093079, "advantages/var": 0.30064082944662474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 3.3030949839914623, "grad_norm": 0.27403367824418345, "learning_rate": 2.6323386065992594e-07, "loss": -0.0, "num_tokens": 121880668.0, "reward": 0.640625, "reward_std": 0.15110857784748077, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 773 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.3073639274279616, "grad_norm": 0.2682281308298368, "learning_rate": 2.6205724414077064e-07, "loss": 0.0, "num_tokens": 122029949.0, "reward": 0.73828125, "reward_std": 0.10376539826393127, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 774 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755680466206912e-09, "advantages/std": 0.49595409631729126, "advantages/var": 0.24597046565390102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.311632870864461, "grad_norm": 0.2743161534441316, "learning_rate": 2.608823285899964e-07, "loss": 0.0, "num_tokens": 122181927.0, "reward": 0.7265625, "reward_std": 0.11481395363807678, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 775 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 6.102934182783377e-09, "advantages/std": 0.4959578812122345, "advantages/var": 0.2459742199365289, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 3.3159018143009606, "grad_norm": 0.2027708527903687, "learning_rate": 2.597091224066581e-07, "loss": 0.0, "num_tokens": 122343058.0, "reward": 0.72265625, "reward_std": 0.11993636190891266, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 776 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.3230332244416e-09, "advantages/std": 0.4374021887779236, "advantages/var": 0.1913206747477183, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 3.32017075773746, "grad_norm": 0.21503882009414887, "learning_rate": 2.5853763397759077e-07, "loss": -0.0, "num_tokens": 122500411.0, "reward": 0.70703125, "reward_std": 0.10061003267765045, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 777 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917184873602147e-09, "advantages/std": 0.46759748458862305, "advantages/var": 0.21864740759360757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.3244397011739593, "grad_norm": 0.2098579693727438, "learning_rate": 2.573678716773496e-07, "loss": 0.0, "num_tokens": 122650780.0, "reward": 0.75, "reward_std": 0.10942068696022034, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 778 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.7248786584154183e-09, "advantages/std": 0.4049513339996338, "advantages/var": 0.16398558290808296, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.328708644610459, "grad_norm": 0.21299236537132937, "learning_rate": 2.561998438681507e-07, "loss": 0.0, "num_tokens": 122800404.0, "reward": 0.76953125, "reward_std": 0.08166831731796265, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 779 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.694570397962185e-09, "advantages/std": 0.49595728516578674, "advantages/var": 0.2459736287090175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.3329775880469583, "grad_norm": 0.3391401964898629, "learning_rate": 2.5503355889981026e-07, "loss": 0.0, "num_tokens": 122946483.0, "reward": 0.6640625, "reward_std": 0.11928971856832504, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 780 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.226802052403756e-09, "advantages/std": 0.5227915048599243, "advantages/var": 0.27331095755370427, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 3.3372465314834576, "grad_norm": 0.24102544569783185, "learning_rate": 2.538690251096862e-07, "loss": -0.0, "num_tokens": 123100248.0, "reward": 0.71484375, "reward_std": 0.13782215118408203, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 781 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.7248446385931138e-09, "advantages/std": 0.4049593210220337, "advantages/var": 0.16399205168262654, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 3.3415154749199574, "grad_norm": 0.20175673231542038, "learning_rate": 2.5270625082261753e-07, "loss": 0.0, "num_tokens": 123242511.0, "reward": 0.76171875, "reward_std": 0.08903107047080994, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 782 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.726138489896817e-09, "advantages/std": 0.4374004006385803, "advantages/var": 0.19131911047879058, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.3457844183564567, "grad_norm": 0.2967863305336702, "learning_rate": 2.5154524435086535e-07, "loss": 0.0, "num_tokens": 123380139.0, "reward": 0.74609375, "reward_std": 0.09837214648723602, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 783 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.1125084199436845e-09, "advantages/std": 0.33064574003219604, "advantages/var": 0.10932660540143857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.3500533617929564, "grad_norm": 0.23352820446542386, "learning_rate": 2.5038601399405335e-07, "loss": -0.0, "num_tokens": 123518056.0, "reward": 0.80859375, "reward_std": 0.05786130577325821, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 784 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755677983704659e-09, "advantages/std": 0.4959544241428375, "advantages/var": 0.24597079082685358, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.3543223052294557, "grad_norm": 0.23057733809022715, "learning_rate": 2.4922856803910784e-07, "loss": 0.0, "num_tokens": 123653611.0, "reward": 0.77734375, "reward_std": 0.11534436047077179, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 785 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1232070097423174e-09, "advantages/std": 0.5482994318008423, "advantages/var": 0.3006322669131265, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.358591248665955, "grad_norm": 0.28881729463753514, "learning_rate": 2.480729147601999e-07, "loss": -0.0, "num_tokens": 123798879.0, "reward": 0.72265625, "reward_std": 0.14256630837917328, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 786 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.983395592005843e-09, "advantages/std": 0.46760234236717224, "advantages/var": 0.21865195058726616, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.362860192102455, "grad_norm": 0.2043066381743843, "learning_rate": 2.469190624186847e-07, "loss": -0.0, "num_tokens": 123962329.0, "reward": 0.58203125, "reward_std": 0.114015132188797, "rewards/drgrpo_math_reward/mean": 0.58203125, "rewards/drgrpo_math_reward/std": 0.49419113993644714, "step": 787 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.8778354937898005e-09, "advantages/std": 0.49595534801483154, "advantages/var": 0.24597170722451267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.367129135538954, "grad_norm": 0.2664637882228327, "learning_rate": 2.4576701926304354e-07, "loss": 0.0, "num_tokens": 124110579.0, "reward": 0.70703125, "reward_std": 0.11652141809463501, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 788 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975093007190219e-09, "advantages/std": 0.46760237216949463, "advantages/var": 0.21865197845853857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.3713980789754534, "grad_norm": 0.238654574305711, "learning_rate": 2.446167935288244e-07, "loss": 0.0, "num_tokens": 124262473.0, "reward": 0.66015625, "reward_std": 0.11401514708995819, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 789 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.898855043494538e-09, "advantages/std": 0.5228031873703003, "advantages/var": 0.2733231727245453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 3.375667022411953, "grad_norm": 0.24661389558231522, "learning_rate": 2.434683934385833e-07, "loss": 0.0, "num_tokens": 124424840.0, "reward": 0.62890625, "reward_std": 0.14978180825710297, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 790 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.9724474836862365e-09, "advantages/std": 0.5483072400093079, "advantages/var": 0.30064082944662474, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 3.3799359658484525, "grad_norm": 0.27253745908815463, "learning_rate": 2.423218272018252e-07, "loss": 0.0, "num_tokens": 124591001.0, "reward": 0.6328125, "reward_std": 0.15110859274864197, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 791 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 6.91155579785241e-09, "advantages/std": 0.5726816058158875, "advantages/var": 0.3279642216398635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.3842049092849518, "grad_norm": 0.25802938171721934, "learning_rate": 2.411771030149453e-07, "loss": 0.0, "num_tokens": 124751630.0, "reward": 0.73046875, "reward_std": 0.15703225135803223, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 792 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.816686215012817e-09, "advantages/std": 0.49596714973449707, "advantages/var": 0.24598341361576104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.3884738527214515, "grad_norm": 0.27168022769566497, "learning_rate": 2.400342290611709e-07, "loss": -0.0, "num_tokens": 124900592.0, "reward": 0.7109375, "reward_std": 0.12953945994377136, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 793 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.962714342249698e-09, "advantages/std": 0.46759846806526184, "advantages/var": 0.2186483273369797, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.392742796157951, "grad_norm": 0.2667009318967389, "learning_rate": 2.3889321351050284e-07, "loss": 0.0, "num_tokens": 125039515.0, "reward": 0.8671875, "reward_std": 0.11059774458408356, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3400367796421051, "step": 794 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987538125611118e-09, "advantages/std": 0.4676036834716797, "advantages/var": 0.2186532047962828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.3970117395944506, "grad_norm": 0.23550138678115673, "learning_rate": 2.3775406451965645e-07, "loss": -0.0, "num_tokens": 125195086.0, "reward": 0.703125, "reward_std": 0.11572261154651642, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 795 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878660979162264e-09, "advantages/std": 0.5726915001869202, "advantages/var": 0.3279755543863452, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.40128068303095, "grad_norm": 0.2735380653658283, "learning_rate": 2.3661679023200422e-07, "loss": -0.0, "num_tokens": 125345445.0, "reward": 0.69140625, "reward_std": 0.16951988637447357, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 796 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 8.464761490163722e-09, "advantages/std": 0.4675998091697693, "advantages/var": 0.21864958153560465, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 3.405549626467449, "grad_norm": 0.23813199755629091, "learning_rate": 2.3548139877751627e-07, "loss": 0.0, "num_tokens": 125493833.0, "reward": 0.73828125, "reward_std": 0.11230521649122238, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 797 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.1125556422290086e-09, "advantages/std": 0.3306383490562439, "advantages/var": 0.10932171786663858, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.409818569903949, "grad_norm": 0.12153519634202017, "learning_rate": 2.343478982727039e-07, "loss": 0.0, "num_tokens": 125645869.0, "reward": 0.66015625, "reward_std": 0.05273643881082535, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 798 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.3696071806564656e-09, "advantages/std": 0.5483006238937378, "advantages/var": 0.3006335741622621, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.4140875133404482, "grad_norm": 0.30323601151678475, "learning_rate": 2.332162968205598e-07, "loss": 0.0, "num_tokens": 125793341.0, "reward": 0.75, "reward_std": 0.1442737877368927, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 799 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.4497956537828906e-09, "advantages/std": 0.40494683384895325, "advantages/var": 0.16398193824429175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.4183564567769475, "grad_norm": 0.19049755713492447, "learning_rate": 2.3208660251050156e-07, "loss": 0.0, "num_tokens": 125927288.0, "reward": 0.828125, "reward_std": 0.07825092226266861, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 800 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.4855000219859937e-09, "advantages/std": 0.46759846806526184, "advantages/var": 0.2186483273369797, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.4226254002134473, "grad_norm": 0.27154904727883944, "learning_rate": 2.309588234183137e-07, "loss": -0.0, "num_tokens": 126065545.0, "reward": 0.7734375, "reward_std": 0.11059774458408356, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 801 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726763010025024, "advantages/var": 0.3279581457299088, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.4268943436499466, "grad_norm": 0.2711538515302693, "learning_rate": 2.298329676060884e-07, "loss": 0.0, "num_tokens": 126232628.0, "reward": 0.609375, "reward_std": 0.15190494060516357, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48884621262550354, "step": 802 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.816749178431964e-09, "advantages/std": 0.49595606327056885, "advantages/var": 0.2459724166948405, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.431163287086446, "grad_norm": 0.19758971646506324, "learning_rate": 2.2870904312217003e-07, "loss": -0.0, "num_tokens": 126384634.0, "reward": 0.85546875, "reward_std": 0.11758224666118622, "rewards/drgrpo_math_reward/mean": 0.85546875, "rewards/drgrpo_math_reward/std": 0.35231640934944153, "step": 803 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.225105741504331e-09, "advantages/std": 0.4959581792354584, "advantages/var": 0.24597451555055105, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.4354322305229457, "grad_norm": 0.2558787584019585, "learning_rate": 2.2758705800109578e-07, "loss": -0.0, "num_tokens": 126536339.0, "reward": 0.6171875, "reward_std": 0.12046678364276886, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 804 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.225068420259267e-09, "advantages/std": 0.49596256017684937, "advantages/var": 0.24597886109717493, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.439701173959445, "grad_norm": 0.2466944973330916, "learning_rate": 2.264670202635396e-07, "loss": -0.0, "num_tokens": 126686026.0, "reward": 0.64453125, "reward_std": 0.1249450072646141, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 805 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.726115133006607e-09, "advantages/std": 0.43740314245224, "advantages/var": 0.19132150902709455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.4439701173959447, "grad_norm": 0.31112164118570995, "learning_rate": 2.2534893791625404e-07, "loss": 0.0, "num_tokens": 126831629.0, "reward": 0.70703125, "reward_std": 0.10008206963539124, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 806 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9917589827329205e-09, "advantages/std": 0.46758797764778137, "advantages/var": 0.2186385168407421, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.448239060832444, "grad_norm": 0.2238639116740361, "learning_rate": 2.2423281895201336e-07, "loss": 0.0, "num_tokens": 126966573.0, "reward": 0.7734375, "reward_std": 0.10034801065921783, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 807 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2196840322654347e-09, "advantages/std": 0.5726826786994934, "advantages/var": 0.3279654504824272, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.4525080042689433, "grad_norm": 0.3077283178476076, "learning_rate": 2.2311867134955636e-07, "loss": -0.0, "num_tokens": 127125366.0, "reward": 0.671875, "reward_std": 0.15873971581459045, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 808 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907367650833425e-10, "advantages/std": 0.5227821469306946, "advantages/var": 0.27330117314946634, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.456776947705443, "grad_norm": 0.30284735739903507, "learning_rate": 2.220065030735288e-07, "loss": -0.0, "num_tokens": 127277936.0, "reward": 0.74609375, "reward_std": 0.1275724172592163, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 809 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6985611762532253e-09, "advantages/std": 0.5483008623123169, "advantages/var": 0.3006338356124303, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.4610458911419424, "grad_norm": 0.2639969304678607, "learning_rate": 2.208963220744276e-07, "loss": -0.0, "num_tokens": 127445671.0, "reward": 0.67578125, "reward_std": 0.1448042094707489, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 810 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.6721447927622405e-09, "advantages/std": 0.5227949619293213, "advantages/var": 0.2733145722186805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 3.4653148345784417, "grad_norm": 0.24547420920224383, "learning_rate": 2.197881362885426e-07, "loss": 0.0, "num_tokens": 127598673.0, "reward": 0.7890625, "reward_std": 0.14123953878879547, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 811 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.51130496365215e-09, "advantages/std": 0.49595779180526733, "advantages/var": 0.2459741312523569, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.4695837780149414, "grad_norm": 0.24518690215315847, "learning_rate": 2.1868195363790143e-07, "loss": 0.0, "num_tokens": 127760070.0, "reward": 0.66796875, "reward_std": 0.11823134124279022, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 812 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5969151899982774e-09, "advantages/std": 0.437400758266449, "advantages/var": 0.19131942333206453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.4738527214514408, "grad_norm": 0.2734723366755448, "learning_rate": 2.175777820302116e-07, "loss": 0.0, "num_tokens": 127916569.0, "reward": 0.6640625, "reward_std": 0.09890256822109222, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 813 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.5969535994024202e-09, "advantages/std": 0.4373902380466461, "advantages/var": 0.19131022033850176, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.47812166488794, "grad_norm": 0.21099785350170763, "learning_rate": 2.1647562935880405e-07, "loss": 0.0, "num_tokens": 128064583.0, "reward": 0.64453125, "reward_std": 0.08929946273565292, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 814 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.816693323644983e-09, "advantages/std": 0.4959658980369568, "advantages/var": 0.24598217201560502, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 3.48239060832444, "grad_norm": 0.2667089119009988, "learning_rate": 2.1537550350257766e-07, "loss": 0.0, "num_tokens": 128222698.0, "reward": 0.68359375, "reward_std": 0.12783199548721313, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 815 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235013757668656e-09, "advantages/std": 0.5227941870689392, "advantages/var": 0.273313762033073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.486659551760939, "grad_norm": 0.25729948182946266, "learning_rate": 2.1427741232594182e-07, "loss": 0.0, "num_tokens": 128365820.0, "reward": 0.796875, "reward_std": 0.1417675018310547, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 816 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.599693686636838e-09, "advantages/std": 0.404949814081192, "advantages/var": 0.16398435192439198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 3.490928495197439, "grad_norm": 0.22707173594327354, "learning_rate": 2.1318136367876093e-07, "loss": 0.0, "num_tokens": 128510234.0, "reward": 0.6640625, "reward_std": 0.07996084541082382, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 817 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 4.408892626470716e-09, "advantages/std": 0.369665265083313, "advantages/var": 0.13665240820911606, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.495197438633938, "grad_norm": 0.22414899065228167, "learning_rate": 2.120873653962983e-07, "loss": 0.0, "num_tokens": 128669693.0, "reward": 0.6171875, "reward_std": 0.06549490243196487, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 818 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.0646075153304846e-09, "advantages/std": 0.43740183115005493, "advantages/var": 0.19132036189342116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.4994663820704375, "grad_norm": 0.2400189816440304, "learning_rate": 2.109954252991595e-07, "loss": 0.0, "num_tokens": 128818258.0, "reward": 0.734375, "reward_std": 0.10007961094379425, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 819 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.596904635896381e-09, "advantages/std": 0.4374036490917206, "advantages/var": 0.19132195223875303, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.503735325506937, "grad_norm": 0.24899442368975186, "learning_rate": 2.0990555119323732e-07, "loss": -0.0, "num_tokens": 128965008.0, "reward": 0.7890625, "reward_std": 0.10231749713420868, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 820 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 8.187712417192517e-09, "advantages/std": 0.3696756958961487, "advantages/var": 0.1366601201363018, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.5080042689434365, "grad_norm": 0.25818591987177536, "learning_rate": 2.0881775086965492e-07, "loss": 0.0, "num_tokens": 129111286.0, "reward": 0.76171875, "reward_std": 0.07456512749195099, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 821 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.2268665404887286e-09, "advantages/std": 0.5227763652801514, "advantages/var": 0.27329512809552625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 3.512273212379936, "grad_norm": 0.3228763186681196, "learning_rate": 2.0773203210471112e-07, "loss": -0.0, "num_tokens": 129254887.0, "reward": 0.8046875, "reward_std": 0.12244509905576706, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 822 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.944862008824502e-09, "advantages/std": 0.5483102798461914, "advantages/var": 0.30064416298500873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.5165421558164356, "grad_norm": 0.31636997771256936, "learning_rate": 2.0664840265982452e-07, "loss": 0.0, "num_tokens": 129417478.0, "reward": 0.6875, "reward_std": 0.1539955586194992, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 823 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.246396939612599e-09, "advantages/std": 0.548301637172699, "advantages/var": 0.30063468532626203, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.520811099252935, "grad_norm": 0.2597116754710221, "learning_rate": 2.0556687028147763e-07, "loss": 0.0, "num_tokens": 129566189.0, "reward": 0.71484375, "reward_std": 0.14427624642848969, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 824 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.7249168688457406e-09, "advantages/std": 0.4049423635005951, "advantages/var": 0.1639783177574481, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.525080042689434, "grad_norm": 0.19985575215620124, "learning_rate": 2.0448744270116203e-07, "loss": 0.0, "num_tokens": 129723689.0, "reward": 0.75390625, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 825 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6985526825316541e-09, "advantages/std": 0.5483036041259766, "advantages/var": 0.3006368422975356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.529348986125934, "grad_norm": 0.2561307664630002, "learning_rate": 2.0341012763532239e-07, "loss": -0.0, "num_tokens": 129876823.0, "reward": 0.703125, "reward_std": 0.14716076850891113, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 826 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.4855557819877025e-09, "advantages/std": 0.46759098768234253, "advantages/var": 0.2186413317617486, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.5336179295624333, "grad_norm": 0.2774758277053324, "learning_rate": 2.0233493278530244e-07, "loss": 0.0, "num_tokens": 130020789.0, "reward": 0.73828125, "reward_std": 0.10429336130619049, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 827 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691728490686944e-09, "advantages/std": 0.572695791721344, "advantages/var": 0.327980469855337, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.537886872998933, "grad_norm": 0.3102557456709466, "learning_rate": 2.0126186583728855e-07, "loss": -0.0, "num_tokens": 130180043.0, "reward": 0.74609375, "reward_std": 0.17464473843574524, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 828 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.764002631405744e-09, "advantages/std": 0.618571937084198, "advantages/var": 0.382631241348097, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.5421558164354323, "grad_norm": 0.3208915683974791, "learning_rate": 2.001909344622559e-07, "loss": 0.0, "num_tokens": 130344774.0, "reward": 0.71484375, "reward_std": 0.19043990969657898, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 829 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9874931899534334e-09, "advantages/std": 0.4676107168197632, "advantages/var": 0.21865978248469276, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.5464247598719316, "grad_norm": 0.2432981280364349, "learning_rate": 1.9912214631591312e-07, "loss": 0.0, "num_tokens": 130477952.0, "reward": 0.84765625, "reward_std": 0.1234995573759079, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.3600577116012573, "step": 830 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.098457610141317e-09, "advantages/std": 0.5726791620254517, "advantages/var": 0.3279614226181735, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.550693703308431, "grad_norm": 0.2968918749530196, "learning_rate": 1.980555090386477e-07, "loss": 0.0, "num_tokens": 130637160.0, "reward": 0.78125, "reward_std": 0.154791921377182, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 831 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.9876278105623057e-09, "advantages/std": 0.4675896465778351, "advantages/var": 0.21864007758678472, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.5549626467449307, "grad_norm": 0.19183500301052084, "learning_rate": 1.9699103025547143e-07, "loss": 0.0, "num_tokens": 130791173.0, "reward": 0.7421875, "reward_std": 0.10258589684963226, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 832 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.49596190452575684, "advantages/var": 0.24597821074081594, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.55923159018143, "grad_norm": 0.24124343899636105, "learning_rate": 1.959287175759653e-07, "loss": 0.0, "num_tokens": 130923799.0, "reward": 0.83203125, "reward_std": 0.1238841712474823, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.3745708465576172, "step": 833 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.907440772282189e-10, "advantages/std": 0.5227778553962708, "advantages/var": 0.2732966860927242, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.5635005336179297, "grad_norm": 0.24767285169176734, "learning_rate": 1.9486857859422607e-07, "loss": -0.0, "num_tokens": 131070507.0, "reward": 0.7265625, "reward_std": 0.12468298524618149, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 834 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.226770571388329e-09, "advantages/std": 0.5227988958358765, "advantages/var": 0.2733186854872116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.567769477054429, "grad_norm": 0.27492333608946673, "learning_rate": 1.938106208888114e-07, "loss": -0.0, "num_tokens": 131231547.0, "reward": 0.671875, "reward_std": 0.14689238369464874, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 835 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131201492978207e-10, "advantages/std": 0.5726844668388367, "advantages/var": 0.3279674985584826, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.5720384204909283, "grad_norm": 0.3346074472616991, "learning_rate": 1.927548520226857e-07, "loss": -0.0, "num_tokens": 131384872.0, "reward": 0.71484375, "reward_std": 0.15991923213005066, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 836 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.920008729382487e-09, "advantages/std": 0.4373980462551117, "advantages/var": 0.19131705086778883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.576307363927428, "grad_norm": 0.21789478595753164, "learning_rate": 1.9170127954316645e-07, "loss": 0.0, "num_tokens": 131535364.0, "reward": 0.71875, "reward_std": 0.09719263017177582, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 837 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.953017746559513e-09, "advantages/std": 0.5960791707038879, "advantages/var": 0.3553103777470348, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.5805763073639274, "grad_norm": 0.2817389613292503, "learning_rate": 1.9064991098186934e-07, "loss": -0.0, "num_tokens": 131700679.0, "reward": 0.640625, "reward_std": 0.18463735282421112, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 838 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 9.447179218732682e-09, "advantages/std": 0.36968278884887695, "advantages/var": 0.13666536437108334, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.584845250800427, "grad_norm": 0.21340090715046417, "learning_rate": 1.8960075385465546e-07, "loss": 0.0, "num_tokens": 131845157.0, "reward": 0.6875, "reward_std": 0.08022041618824005, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 839 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.726129604088858e-09, "advantages/std": 0.4374014437198639, "advantages/var": 0.19132002296822126, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.5891141942369265, "grad_norm": 0.22177987637345595, "learning_rate": 1.8855381566157725e-07, "loss": -0.0, "num_tokens": 132005552.0, "reward": 0.66796875, "reward_std": 0.09954920411109924, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 840 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.749589180951444e-09, "advantages/std": 0.4049517810344696, "advantages/var": 0.16398594496298902, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.5933831376734258, "grad_norm": 0.19025125246218424, "learning_rate": 1.8750910388682427e-07, "loss": -0.0, "num_tokens": 132143786.0, "reward": 0.828125, "reward_std": 0.08219873160123825, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 841 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.1938662863450107e-09, "advantages/std": 0.437395840883255, "advantages/var": 0.19131512162196973, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.597652081109925, "grad_norm": 0.19459458021058607, "learning_rate": 1.8646662599867068e-07, "loss": -0.0, "num_tokens": 132293070.0, "reward": 0.76171875, "reward_std": 0.09442432969808578, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 842 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0646268830368173e-09, "advantages/std": 0.4373938739299774, "advantages/var": 0.19131340095147298, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 3.601921024546425, "grad_norm": 0.2134909524177928, "learning_rate": 1.8542638944942125e-07, "loss": -0.0, "num_tokens": 132454528.0, "reward": 0.66796875, "reward_std": 0.09377524256706238, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 843 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 7.904298133858483e-09, "advantages/std": 0.6185803413391113, "advantages/var": 0.3826416386912115, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.606189967982924, "grad_norm": 0.30309840432087204, "learning_rate": 1.8438840167535824e-07, "loss": 0.0, "num_tokens": 132620507.0, "reward": 0.6953125, "reward_std": 0.20133629441261292, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 844 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.548316478729248, "advantages/var": 0.3006509608460419, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.610458911419424, "grad_norm": 0.2991431432555865, "learning_rate": 1.8335267009668792e-07, "loss": 0.0, "num_tokens": 132778683.0, "reward": 0.69140625, "reward_std": 0.1618887335062027, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 845 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.3472194717341796e-09, "advantages/std": 0.4959711730480194, "advantages/var": 0.24598740449462841, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.614727854855923, "grad_norm": 0.3224258068457957, "learning_rate": 1.8231920211748818e-07, "loss": 0.0, "num_tokens": 132930609.0, "reward": 0.7109375, "reward_std": 0.133487269282341, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 846 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.987660943944554e-09, "advantages/std": 0.4675844609737396, "advantages/var": 0.21863522814410263, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.6189967982924225, "grad_norm": 0.22463884937510695, "learning_rate": 1.812880051256551e-07, "loss": 0.0, "num_tokens": 133084022.0, "reward": 0.75, "reward_std": 0.0974610298871994, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 847 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.1939374485847554e-09, "advantages/std": 0.43738609552383423, "advantages/var": 0.19130659655758464, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.6232657417289222, "grad_norm": 0.24305824007115434, "learning_rate": 1.8025908649285032e-07, "loss": 0.0, "num_tokens": 133223806.0, "reward": 0.703125, "reward_std": 0.08588206768035889, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 848 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.855440276329792e-09, "advantages/std": 0.4373944401741028, "advantages/var": 0.19131389629521678, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.6275346851654215, "grad_norm": 0.2853846954871858, "learning_rate": 1.7923245357444843e-07, "loss": 0.0, "num_tokens": 133358507.0, "reward": 0.7578125, "reward_std": 0.09271685779094696, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 849 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.388932047240732e-10, "advantages/std": 0.4959683120250702, "advantages/var": 0.24598456653299738, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.6318036286019213, "grad_norm": 0.25126654517040875, "learning_rate": 1.7820811370948368e-07, "loss": -0.0, "num_tokens": 133503879.0, "reward": 0.75390625, "reward_std": 0.12954190373420715, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 850 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.2187931845011295e-09, "advantages/std": 0.5483078360557556, "advantages/var": 0.3006414830801454, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 3.6360725720384206, "grad_norm": 0.3011783803267109, "learning_rate": 1.7718607422059879e-07, "loss": -0.0, "num_tokens": 133673186.0, "reward": 0.625, "reward_std": 0.15216940641403198, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4850712716579437, "step": 851 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.7896166871692685e-09, "advantages/std": 0.522797703742981, "advantages/var": 0.2733174390389337, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.64034151547492, "grad_norm": 0.2899610395590994, "learning_rate": 1.7616634241399176e-07, "loss": -0.0, "num_tokens": 133816736.0, "reward": 0.72265625, "reward_std": 0.14518490433692932, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 852 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.0327963533730878e-09, "advantages/std": 0.5726855993270874, "advantages/var": 0.3279687956766253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.644610458911419, "grad_norm": 0.2765547383782989, "learning_rate": 1.7514892557936307e-07, "loss": -0.0, "num_tokens": 133982791.0, "reward": 0.6640625, "reward_std": 0.16162671148777008, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 853 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.987660943944554e-09, "advantages/std": 0.4675844609737396, "advantages/var": 0.21863522814410263, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.648879402347919, "grad_norm": 0.2622846551934317, "learning_rate": 1.741338309898656e-07, "loss": -0.0, "num_tokens": 134133320.0, "reward": 0.6953125, "reward_std": 0.0974610298871994, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 854 }, { "advantages/mean": -5.3551048040390015e-09, "advantages/snr": 8.09801548684253e-09, "advantages/std": 0.6612860560417175, "advantages/var": 0.4372992479152096, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.6531483457844183, "grad_norm": 0.36731981713267514, "learning_rate": 1.7312106590205012e-07, "loss": 0.0, "num_tokens": 134284425.0, "reward": 0.71484375, "reward_std": 0.2217308133840561, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 855 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.492732023090665e-09, "advantages/std": 0.5483056306838989, "advantages/var": 0.30063906463966816, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.657417289220918, "grad_norm": 0.2825978000496106, "learning_rate": 1.7211063755581524e-07, "loss": -0.0, "num_tokens": 134442698.0, "reward": 0.7265625, "reward_std": 0.15045949816703796, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 856 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.691724344030994e-09, "advantages/std": 0.5726962089538574, "advantages/var": 0.3279809477501203, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.6616862326574173, "grad_norm": 0.2549773013386139, "learning_rate": 1.7110255317435502e-07, "loss": 0.0, "num_tokens": 134621058.0, "reward": 0.6171875, "reward_std": 0.17358636856079102, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 857 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.6918558552053295e-09, "advantages/std": 0.5726829767227173, "advantages/var": 0.32796579182799235, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 3.6659551760939166, "grad_norm": 0.36316391513683494, "learning_rate": 1.700968199641069e-07, "loss": 0.0, "num_tokens": 134775005.0, "reward": 0.74609375, "reward_std": 0.15927013754844666, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 858 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 4.408847130130579e-09, "advantages/std": 0.3696690797805786, "advantages/var": 0.1366552285458198, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.6702241195304164, "grad_norm": 0.24753457132012033, "learning_rate": 1.6909344511470114e-07, "loss": 0.0, "num_tokens": 134902976.0, "reward": 0.78515625, "reward_std": 0.06944026052951813, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 859 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.492850196788031e-10, "advantages/std": 0.5482980012893677, "advantages/var": 0.30063069821791544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.6744930629669157, "grad_norm": 0.3315546828284674, "learning_rate": 1.6809243579890865e-07, "loss": 0.0, "num_tokens": 135037019.0, "reward": 0.68359375, "reward_std": 0.14032843708992004, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 860 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.492722790909057e-10, "advantages/std": 0.5483062267303467, "advantages/var": 0.30063971827127034, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.678762006403415, "grad_norm": 0.286009008207847, "learning_rate": 1.6709379917259025e-07, "loss": 0.0, "num_tokens": 135188376.0, "reward": 0.66796875, "reward_std": 0.15110613405704498, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 861 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.9724759189415973e-09, "advantages/std": 0.5483019948005676, "advantages/var": 0.3006350775022817, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.6830309498399147, "grad_norm": 2.4436489261824925, "learning_rate": 1.6609754237464473e-07, "loss": 0.0, "num_tokens": 135341019.0, "reward": 0.734375, "reward_std": 0.14651167392730713, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 862 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.1231809285315785e-09, "advantages/std": 0.5483061671257019, "advantages/var": 0.30063965290807815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.687299893276414, "grad_norm": 0.34011279543517225, "learning_rate": 1.6510367252695878e-07, "loss": -0.0, "num_tokens": 135503240.0, "reward": 0.55859375, "reward_std": 0.15110613405704498, "rewards/drgrpo_math_reward/mean": 0.55859375, "rewards/drgrpo_math_reward/std": 0.4975275993347168, "step": 863 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.1292238802003243e-09, "advantages/std": 0.4374000132083893, "advantages/var": 0.19131877155469912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.6915688367129134, "grad_norm": 0.24761728765685573, "learning_rate": 1.6411219673435563e-07, "loss": 0.0, "num_tokens": 135635341.0, "reward": 0.796875, "reward_std": 0.09784172475337982, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 864 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.11761315668422e-09, "advantages/std": 0.5227763652801514, "advantages/var": 0.27329512809552625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.695837780149413, "grad_norm": 0.36749810582776, "learning_rate": 1.631231220845437e-07, "loss": 0.0, "num_tokens": 135784445.0, "reward": 0.765625, "reward_std": 0.12244509160518646, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 865 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016642767905576e-09, "advantages/std": 0.5227813720703125, "advantages/var": 0.2733003629837185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.7001067235859124, "grad_norm": 0.24979578320151785, "learning_rate": 1.621364556480675e-07, "loss": 0.0, "num_tokens": 135939903.0, "reward": 0.84765625, "reward_std": 0.12810038030147552, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.3600577116012573, "step": 866 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.258521169625515e-09, "advantages/std": 0.4373924732208252, "advantages/var": 0.19131217563023029, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.704375667022412, "grad_norm": 0.2654350357443024, "learning_rate": 1.61152204478255e-07, "loss": -0.0, "num_tokens": 136095139.0, "reward": 0.734375, "reward_std": 0.09206776320934296, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 867 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.694500720521717e-10, "advantages/std": 0.4959646463394165, "advantages/var": 0.2459809304185825, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.7086446104589115, "grad_norm": 0.2536901123402134, "learning_rate": 1.6017037561116897e-07, "loss": 0.0, "num_tokens": 136238980.0, "reward": 0.7421875, "reward_std": 0.1261245161294937, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 868 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175925403475357e-09, "advantages/std": 0.5227798223495483, "advantages/var": 0.2732987426558253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.712913553895411, "grad_norm": 0.24456262286722547, "learning_rate": 1.59190976065556e-07, "loss": 0.0, "num_tokens": 136409795.0, "reward": 0.61328125, "reward_std": 0.1258624941110611, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 869 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.661552881108902e-09, "advantages/std": 0.43739622831344604, "advantages/var": 0.19131546054282822, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 3.71718249733191, "grad_norm": 0.2562752288229779, "learning_rate": 1.5821401284279567e-07, "loss": 0.0, "num_tokens": 136562123.0, "reward": 0.6640625, "reward_std": 0.09495474398136139, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 870 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.9724516843147123e-09, "advantages/std": 0.5483064651489258, "advantages/var": 0.30063997972411016, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.72145144076841, "grad_norm": 0.29765250685263855, "learning_rate": 1.572394929268519e-07, "loss": -0.0, "num_tokens": 136710574.0, "reward": 0.671875, "reward_std": 0.15163654088974, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 871 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.972501446509269e-09, "advantages/std": 0.5482972860336304, "advantages/var": 0.3006299138718447, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.725720384204909, "grad_norm": 0.23036154460610173, "learning_rate": 1.5626742328422194e-07, "loss": -0.0, "num_tokens": 136864200.0, "reward": 0.72265625, "reward_std": 0.14085638523101807, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 872 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 5.163816746546357e-09, "advantages/std": 0.495977520942688, "advantages/var": 0.2459937012804545, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.729989327641409, "grad_norm": 0.22976091980157884, "learning_rate": 1.5529781086388688e-07, "loss": 0.0, "num_tokens": 137014888.0, "reward": 0.77734375, "reward_std": 0.13873080909252167, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 873 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46759456396102905, "advantages/var": 0.2186446762459049, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 3.734258271077908, "grad_norm": 0.2551973930148244, "learning_rate": 1.543306625972623e-07, "loss": -0.0, "num_tokens": 137174278.0, "reward": 0.61328125, "reward_std": 0.10718034207820892, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 874 }, { "advantages/mean": -6.51925802230835e-09, "advantages/snr": 1.0937042648124181e-08, "advantages/std": 0.5960713624954224, "advantages/var": 0.3553010691871492, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.7385272145144075, "grad_norm": 0.2965880638380372, "learning_rate": 1.5336598539814783e-07, "loss": -0.0, "num_tokens": 137310614.0, "reward": 0.80078125, "reward_std": 0.17662307620048523, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 875 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.259734082608678e-09, "advantages/std": 0.3696504533290863, "advantages/var": 0.136641457646399, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.7427961579509073, "grad_norm": 0.21159351586605826, "learning_rate": 1.5240378616267886e-07, "loss": -0.0, "num_tokens": 137445559.0, "reward": 0.74609375, "reward_std": 0.055242717266082764, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 876 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 8.907189930094833e-09, "advantages/std": 0.5227925777435303, "advantages/var": 0.27331207934372515, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 3.7470651013874066, "grad_norm": 0.28854439169914126, "learning_rate": 1.5144407176927647e-07, "loss": -0.0, "num_tokens": 137600127.0, "reward": 0.7109375, "reward_std": 0.14111842215061188, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 877 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.041735353792631e-10, "advantages/std": 0.33064383268356323, "advantages/var": 0.10932534409167616, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.7513340448239063, "grad_norm": 0.15326824587402674, "learning_rate": 1.504868490785987e-07, "loss": 0.0, "num_tokens": 137757873.0, "reward": 0.6484375, "reward_std": 0.056153833866119385, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 878 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.85536048412357e-09, "advantages/std": 0.4374004006385803, "advantages/var": 0.19131911047879058, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.7556029882604056, "grad_norm": 0.2227162347817429, "learning_rate": 1.4953212493349076e-07, "loss": -0.0, "num_tokens": 137899606.0, "reward": 0.73828125, "reward_std": 0.09837213903665543, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 879 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.877790810008545e-09, "advantages/std": 0.49596714973449707, "advantages/var": 0.24598341361576104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.759871931696905, "grad_norm": 0.27176674002899376, "learning_rate": 1.4857990615893718e-07, "loss": -0.0, "num_tokens": 138061119.0, "reward": 0.7265625, "reward_std": 0.12953945994377136, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 880 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.34430603695948e-09, "advantages/std": 0.5227933526039124, "advantages/var": 0.27331288952683863, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.7641408751334042, "grad_norm": 0.26342087753288634, "learning_rate": 1.4763019956201251e-07, "loss": 0.0, "num_tokens": 138207026.0, "reward": 0.7109375, "reward_std": 0.14059044420719147, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 881 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 8.94427137290863e-09, "advantages/std": 0.5726876854896545, "advantages/var": 0.3279711851114975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.768409818569904, "grad_norm": 0.3527159478095035, "learning_rate": 1.4668301193183196e-07, "loss": 0.0, "num_tokens": 138350855.0, "reward": 0.859375, "reward_std": 0.1633366346359253, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3483152687549591, "step": 882 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.40494078397750854, "advantages/var": 0.16397703852831924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.7726787620064033, "grad_norm": 0.22089051678841656, "learning_rate": 1.4573835003950435e-07, "loss": -0.0, "num_tokens": 138499503.0, "reward": 0.75, "reward_std": 0.07312604784965515, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 883 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 5.646024076671005e-09, "advantages/std": 0.6185697317123413, "advantages/var": 0.3826285129906779, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.776947705442903, "grad_norm": 0.33332968074776875, "learning_rate": 1.4479622063808239e-07, "loss": 0.0, "num_tokens": 138659563.0, "reward": 0.6484375, "reward_std": 0.18819957971572876, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 884 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175139874944796e-09, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.7812166488794023, "grad_norm": 0.2913051287443507, "learning_rate": 1.438566304625151e-07, "loss": -0.0, "num_tokens": 138818903.0, "reward": 0.66015625, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 885 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.6945190564897304e-09, "advantages/std": 0.4959627091884613, "advantages/var": 0.24597900890555824, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 3.7854855923159016, "grad_norm": 0.30882217106637433, "learning_rate": 1.429195862295997e-07, "loss": -0.0, "num_tokens": 138970507.0, "reward": 0.62109375, "reward_std": 0.12335620820522308, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 886 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.164009129592525e-09, "advantages/std": 0.4959590435028076, "advantages/var": 0.24597537283221982, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 3.7897545357524014, "grad_norm": 0.23766551699507737, "learning_rate": 1.4198509463793273e-07, "loss": -0.0, "num_tokens": 139131572.0, "reward": 0.6328125, "reward_std": 0.11993881314992905, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 887 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.528024122922025e-10, "advantages/std": 0.6185703873634338, "advantages/var": 0.3826293241229486, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.7940234791889007, "grad_norm": 0.33804980606057317, "learning_rate": 1.410531623678633e-07, "loss": -0.0, "num_tokens": 139303132.0, "reward": 0.5625, "reward_std": 0.18767160177230835, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49705013632774353, "step": 888 }, { "advantages/mean": -6.752088665962219e-09, "advantages/snr": 1.2915415091048558e-08, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.7982924226254005, "grad_norm": 0.33630546338820555, "learning_rate": 1.4012379608144475e-07, "loss": 0.0, "num_tokens": 139445873.0, "reward": 0.76171875, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 889 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262340360707772e-09, "advantages/std": 0.5726866722106934, "advantages/var": 0.32797002452775814, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.8025613660618998, "grad_norm": 0.31992935383858606, "learning_rate": 1.3919700242238712e-07, "loss": 0.0, "num_tokens": 139616331.0, "reward": 0.57421875, "reward_std": 0.1633341759443283, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 890 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.246379398265418e-09, "advantages/std": 0.5483039021492004, "advantages/var": 0.30063716911203997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.806830309498399, "grad_norm": 0.3029778902195819, "learning_rate": 1.3827278801600978e-07, "loss": 0.0, "num_tokens": 139762906.0, "reward": 0.78515625, "reward_std": 0.14769119024276733, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 891 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.3360704161248273e-09, "advantages/std": 0.5227957367897034, "advantages/var": 0.2733153824054888, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.8110992529348984, "grad_norm": 0.24190131077280838, "learning_rate": 1.373511594691934e-07, "loss": -0.0, "num_tokens": 139923060.0, "reward": 0.7265625, "reward_std": 0.14400538802146912, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 892 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4938013376916182e-09, "advantages/std": 0.46759358048439026, "advantages/var": 0.21864375651021195, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.815368196371398, "grad_norm": 0.213665408572706, "learning_rate": 1.3643212337033393e-07, "loss": -0.0, "num_tokens": 140072958.0, "reward": 0.75390625, "reward_std": 0.1060032844543457, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 893 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.2727493096958816e-09, "advantages/std": 0.6402799487113953, "advantages/var": 0.40995841272186695, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.8196371398078974, "grad_norm": 0.29694736667971106, "learning_rate": 1.3551568628929432e-07, "loss": 0.0, "num_tokens": 140234033.0, "reward": 0.67578125, "reward_std": 0.2014860063791275, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 894 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.46761050820350647, "advantages/var": 0.2186595873823416, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.823906083244397, "grad_norm": 0.19275105427008563, "learning_rate": 1.346018547773582e-07, "loss": 0.0, "num_tokens": 140380833.0, "reward": 0.7265625, "reward_std": 0.12138035148382187, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 895 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.659128645842666e-09, "advantages/std": 0.5726706981658936, "advantages/var": 0.32795172853781196, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.8281750266808965, "grad_norm": 0.26645263914606415, "learning_rate": 1.3369063536718344e-07, "loss": 0.0, "num_tokens": 140522260.0, "reward": 0.796875, "reward_std": 0.14454218745231628, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 896 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.991712901920792e-09, "advantages/std": 0.4675987958908081, "advantages/var": 0.21864863391853362, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.832443970117396, "grad_norm": 0.2393996622425807, "learning_rate": 1.3278203457275399e-07, "loss": -0.0, "num_tokens": 140676901.0, "reward": 0.77734375, "reward_std": 0.11112815886735916, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 897 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629629032543266e-09, "advantages/std": 0.5227798223495483, "advantages/var": 0.2732987426558253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.8367129135538955, "grad_norm": 0.24755508461462025, "learning_rate": 1.3187605888933505e-07, "loss": 0.0, "num_tokens": 140835149.0, "reward": 0.70703125, "reward_std": 0.1258624941110611, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 898 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.453673669766467e-09, "advantages/std": 0.5227833390235901, "advantages/var": 0.27330241956065393, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.840981856990395, "grad_norm": 0.2601741221037382, "learning_rate": 1.3097271479342525e-07, "loss": 0.0, "num_tokens": 140996194.0, "reward": 0.6328125, "reward_std": 0.12927988171577454, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 899 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.323127886632988e-09, "advantages/std": 0.4373944103717804, "advantages/var": 0.19131387022447743, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.8452508004268946, "grad_norm": 0.23103748082036624, "learning_rate": 1.3007200874271124e-07, "loss": 0.0, "num_tokens": 141132943.0, "reward": 0.7890625, "reward_std": 0.09271685779094696, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 900 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.633436069767149e-09, "advantages/std": 0.4959615468978882, "advantages/var": 0.24597785600134614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.849519743863394, "grad_norm": 0.23474980568006226, "learning_rate": 1.291739471760212e-07, "loss": 0.0, "num_tokens": 141282228.0, "reward": 0.7734375, "reward_std": 0.1233537495136261, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 901 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2268269331908505e-09, "advantages/std": 0.5227856636047363, "advantages/var": 0.27330485007064453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 3.853788687299893, "grad_norm": 0.25644152549883725, "learning_rate": 1.282785365132788e-07, "loss": -0.0, "num_tokens": 141449187.0, "reward": 0.5390625, "reward_std": 0.13269484043121338, "rewards/drgrpo_math_reward/mean": 0.5390625, "rewards/drgrpo_math_reward/std": 0.4994482398033142, "step": 902 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5960760116577148, "advantages/var": 0.3553066116737682, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.8580576307363925, "grad_norm": 0.3059233842690013, "learning_rate": 1.273857831554575e-07, "loss": -0.0, "num_tokens": 141613248.0, "reward": 0.72265625, "reward_std": 0.1828087568283081, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 903 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4083371836861376e-09, "advantages/std": 0.4959692358970642, "advantages/var": 0.24598548295631772, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.8623265741728923, "grad_norm": 0.26417244640023585, "learning_rate": 1.2649569348453415e-07, "loss": 0.0, "num_tokens": 141786131.0, "reward": 0.63671875, "reward_std": 0.13071897625923157, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 904 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.408326774808376e-09, "advantages/std": 0.4959729015827179, "advantages/var": 0.24598911910438037, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.8665955176093916, "grad_norm": 0.25801166081421384, "learning_rate": 1.256082738634444e-07, "loss": 0.0, "num_tokens": 141934869.0, "reward": 0.6875, "reward_std": 0.134136363863945, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 905 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.117604270298766e-09, "advantages/std": 0.5227778553962708, "advantages/var": 0.2732966860927242, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.8708644610458913, "grad_norm": 0.28898482123519487, "learning_rate": 1.2472353063603623e-07, "loss": 0.0, "num_tokens": 142073269.0, "reward": 0.8515625, "reward_std": 0.12468297779560089, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.3562295734882355, "step": 906 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987508612632739e-09, "advantages/std": 0.4676083028316498, "advantages/var": 0.2186575248770959, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 3.8751334044823906, "grad_norm": 0.15786910018254705, "learning_rate": 1.2384147012702518e-07, "loss": -0.0, "num_tokens": 142230015.0, "reward": 0.75390625, "reward_std": 0.12020084261894226, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 907 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.246344777610733e-10, "advantages/std": 0.5483083724975586, "advantages/var": 0.30064207135092147, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.87940234791889, "grad_norm": 0.29409113629686395, "learning_rate": 1.229620986419494e-07, "loss": 0.0, "num_tokens": 142384322.0, "reward": 0.70703125, "reward_std": 0.1528160572052002, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 908 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 4.024678068029527e-09, "advantages/std": 0.4049552381038666, "advantages/var": 0.16398874486775927, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.8836712913553897, "grad_norm": 0.17047396269876772, "learning_rate": 1.2208542246712344e-07, "loss": 0.0, "num_tokens": 142543955.0, "reward": 0.67578125, "reward_std": 0.0861440896987915, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 909 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6721368717488085e-09, "advantages/std": 0.5227965116500854, "advantages/var": 0.27331619259349793, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.887940234791889, "grad_norm": 0.3010578150999706, "learning_rate": 1.2121144786959464e-07, "loss": 0.0, "num_tokens": 142716353.0, "reward": 0.6953125, "reward_std": 0.1434774398803711, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 910 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.812095983572427e-10, "advantages/std": 0.5960772633552551, "advantages/var": 0.3553081038890902, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.8922091782283887, "grad_norm": 0.28912195447402445, "learning_rate": 1.2034018109709716e-07, "loss": -0.0, "num_tokens": 142861296.0, "reward": 0.78125, "reward_std": 0.18463245034217834, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 911 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.694555446758157e-10, "advantages/std": 0.4959588646888733, "advantages/var": 0.24597519546347613, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.896478121664888, "grad_norm": 0.2527869648597765, "learning_rate": 1.1947162837800838e-07, "loss": 0.0, "num_tokens": 143018752.0, "reward": 0.6171875, "reward_std": 0.12152761220932007, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.48702529072761536, "step": 912 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.991751746761795e-09, "advantages/std": 0.46758967638015747, "advantages/var": 0.2186401054573004, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.9007470651013874, "grad_norm": 0.2158798647469881, "learning_rate": 1.1860579592130365e-07, "loss": 0.0, "num_tokens": 143148036.0, "reward": 0.8359375, "reward_std": 0.10258589684963226, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.3710577189922333, "step": 913 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 1.0615920798986244e-08, "advantages/std": 0.548305332660675, "advantages/var": 0.30063873782413353, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.9050160085378867, "grad_norm": 0.2755074303952438, "learning_rate": 1.1774268991651209e-07, "loss": 0.0, "num_tokens": 143309921.0, "reward": 0.81640625, "reward_std": 0.14992906153202057, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 914 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.8181813666449564e-09, "advantages/std": 0.6402844190597534, "advantages/var": 0.4099641372906859, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.9092849519743864, "grad_norm": 0.3347062868224139, "learning_rate": 1.168823165336727e-07, "loss": -0.0, "num_tokens": 143490099.0, "reward": 0.55859375, "reward_std": 0.20608291029930115, "rewards/drgrpo_math_reward/mean": 0.55859375, "rewards/drgrpo_math_reward/std": 0.4975275993347168, "step": 915 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4083547014131486e-09, "advantages/std": 0.49596306681632996, "advantages/var": 0.24597936364585937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.9135538954108857, "grad_norm": 0.26389252496546856, "learning_rate": 1.1602468192328934e-07, "loss": -0.0, "num_tokens": 143624497.0, "reward": 0.796875, "reward_std": 0.12388662248849869, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 916 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.0646119401001621e-09, "advantages/std": 0.4374000132083893, "advantages/var": 0.19131877155469912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 3.9178228388473855, "grad_norm": 0.1934175385847749, "learning_rate": 1.1516979221628803e-07, "loss": 0.0, "num_tokens": 143787782.0, "reward": 0.75, "reward_std": 0.09784172475337982, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 917 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5726915597915649, "advantages/var": 0.3279756226564956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.9220917822838848, "grad_norm": 0.2799402291849327, "learning_rate": 1.1431765352397166e-07, "loss": 0.0, "num_tokens": 143928082.0, "reward": 0.80859375, "reward_std": 0.16951988637447357, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 918 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.453658436377939e-09, "advantages/std": 0.5227851271629333, "advantages/var": 0.2733042891827644, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.926360725720384, "grad_norm": 0.2900591127626478, "learning_rate": 1.1346827193797797e-07, "loss": -0.0, "num_tokens": 144068591.0, "reward": 0.76171875, "reward_std": 0.13204818964004517, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 919 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1231809285315785e-09, "advantages/std": 0.5483061671257019, "advantages/var": 0.30063965290807815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.930629669156884, "grad_norm": 0.26515987521507567, "learning_rate": 1.1262165353023472e-07, "loss": 0.0, "num_tokens": 144226666.0, "reward": 0.71484375, "reward_std": 0.1511061191558838, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 920 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 5.163944896884701e-09, "advantages/std": 0.49596521258354187, "advantages/var": 0.24598149209303788, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.934898612593383, "grad_norm": 0.23332221023622182, "learning_rate": 1.117778043529164e-07, "loss": -0.0, "num_tokens": 144374744.0, "reward": 0.73828125, "reward_std": 0.12677115201950073, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 921 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 9.15431138117741e-09, "advantages/std": 0.6612836718559265, "advantages/var": 0.4372960946632567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.9391675560298824, "grad_norm": 0.3211763858336168, "learning_rate": 1.1093673043840179e-07, "loss": 0.0, "num_tokens": 144545000.0, "reward": 0.57421875, "reward_std": 0.2205488383769989, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 922 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.225008758413878e-09, "advantages/std": 0.4959695637226105, "advantages/var": 0.24598580813919657, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.943436499466382, "grad_norm": 0.32812558235268086, "learning_rate": 1.1009843779922978e-07, "loss": 0.0, "num_tokens": 144688710.0, "reward": 0.6875, "reward_std": 0.13124938309192657, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 923 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.4855557819877025e-09, "advantages/std": 0.46759098768234253, "advantages/var": 0.2186413317617486, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.9477054429028815, "grad_norm": 0.24068333353653437, "learning_rate": 1.0926293242805735e-07, "loss": 0.0, "num_tokens": 144847224.0, "reward": 0.80859375, "reward_std": 0.10429336875677109, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 924 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.2727621056868287e-09, "advantages/std": 0.6402774453163147, "advantages/var": 0.40995520698078636, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.951974386339381, "grad_norm": 0.35789550154522065, "learning_rate": 1.0843022029761595e-07, "loss": 0.0, "num_tokens": 145004617.0, "reward": 0.66015625, "reward_std": 0.19701021909713745, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 925 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.323088352971216e-10, "advantages/std": 0.43739765882492065, "advantages/var": 0.1913167119455217, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 3.9562433297758806, "grad_norm": 0.22273311099582965, "learning_rate": 1.076003073606695e-07, "loss": 0.0, "num_tokens": 145158662.0, "reward": 0.76953125, "reward_std": 0.09666222333908081, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 926 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.694597479630824e-09, "advantages/std": 0.4959544241428375, "advantages/var": 0.24597079082685358, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.96051227321238, "grad_norm": 0.2734087805574896, "learning_rate": 1.0677319954997127e-07, "loss": 0.0, "num_tokens": 145303572.0, "reward": 0.77734375, "reward_std": 0.11534436047077179, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 927 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2524220349498645e-09, "advantages/std": 0.5726947784423828, "advantages/var": 0.32797930925516994, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 3.9647812166488796, "grad_norm": 0.28946305611975287, "learning_rate": 1.0594890277822149e-07, "loss": 0.0, "num_tokens": 145479142.0, "reward": 0.5546875, "reward_std": 0.17464229464530945, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49797385931015015, "step": 928 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 7.636414345071518e-09, "advantages/std": 0.64028000831604, "advantages/var": 0.4099584890491883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.969050160085379, "grad_norm": 0.37535894555351246, "learning_rate": 1.0512742293802556e-07, "loss": 0.0, "num_tokens": 145634602.0, "reward": 0.80859375, "reward_std": 0.19819219410419464, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 929 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.898892464424674e-09, "advantages/std": 0.5227991938591003, "advantages/var": 0.2733189970997252, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.9733191035218782, "grad_norm": 0.24714138330863633, "learning_rate": 1.0430876590185162e-07, "loss": 0.0, "num_tokens": 145789751.0, "reward": 0.72265625, "reward_std": 0.14742279052734375, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 930 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344368797067637e-09, "advantages/std": 0.5227872133255005, "advantages/var": 0.27330647041664236, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.9775880469583775, "grad_norm": 0.34603444556490903, "learning_rate": 1.034929375219884e-07, "loss": 0.0, "num_tokens": 145929123.0, "reward": 0.8515625, "reward_std": 0.1349327117204666, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.3562295734882355, "step": 931 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.49293144311269e-09, "advantages/std": 0.5482927560806274, "advantages/var": 0.3006249463704904, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.9818569903948773, "grad_norm": 0.281880223070927, "learning_rate": 1.0267994363050386e-07, "loss": 0.0, "num_tokens": 146071625.0, "reward": 0.83984375, "reward_std": 0.1357315182685852, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.36746934056282043, "step": 932 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.547752029076837e-09, "advantages/std": 0.5483201742172241, "advantages/var": 0.300655013453607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.9861259338313766, "grad_norm": 0.24291507723078548, "learning_rate": 1.0186979003920271e-07, "loss": 0.0, "num_tokens": 146228741.0, "reward": 0.69140625, "reward_std": 0.16754156351089478, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 933 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.8989756861545536e-09, "advantages/std": 0.5227903127670288, "advantages/var": 0.2733097111230478, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.9903948772678763, "grad_norm": 0.24655260154008715, "learning_rate": 1.0106248253958604e-07, "loss": 0.0, "num_tokens": 146376853.0, "reward": 0.7890625, "reward_std": 0.1361146867275238, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 934 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.439360447893462e-09, "advantages/std": 0.5726844668388367, "advantages/var": 0.3279674985584826, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.9946638207043756, "grad_norm": 0.30695823633009073, "learning_rate": 1.0025802690280849e-07, "loss": 0.0, "num_tokens": 146515186.0, "reward": 0.78515625, "reward_std": 0.15991923213005066, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 935 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 7.474437331114142e-09, "advantages/std": 0.40495333075523376, "advantages/var": 0.16398720008975776, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.998932764140875, "grad_norm": 0.19395846323411928, "learning_rate": 9.94564288796384e-08, "loss": 0.0, "num_tokens": 146666516.0, "reward": 0.79296875, "reward_std": 0.08390620350837708, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 936 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975255240285385e-09, "advantages/std": 0.46758967638015747, "advantages/var": 0.2186401054573004, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.004268943436499, "grad_norm": 0.2328061525623079, "learning_rate": 9.865769420041559e-08, "loss": 0.0, "num_tokens": 146800260.0, "reward": 0.7890625, "reward_std": 0.10258589684963226, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 937 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5624185717793762e-09, "advantages/std": 0.5960775017738342, "advantages/var": 0.35530838812093535, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.008537886872999, "grad_norm": 0.3494739089978992, "learning_rate": 9.786182857501118e-08, "loss": -0.0, "num_tokens": 146955997.0, "reward": 0.57421875, "reward_std": 0.1834578514099121, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.49542948603630066, "step": 938 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.90732195053758e-10, "advantages/std": 0.5227848291397095, "advantages/var": 0.2733039775786352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.012806830309499, "grad_norm": 0.2601092761417996, "learning_rate": 9.706883769278639e-08, "loss": 0.0, "num_tokens": 147109230.0, "reward": 0.734375, "reward_std": 0.13151776790618896, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 939 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.3971312155877077e-09, "advantages/std": 0.5482994318008423, "advantages/var": 0.3006322669131265, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.017075773745998, "grad_norm": 0.3054328373484479, "learning_rate": 9.627872722255154e-08, "loss": 0.0, "num_tokens": 147255304.0, "reward": 0.81640625, "reward_std": 0.14256632328033447, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 940 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.877782911581517e-09, "advantages/std": 0.4959692358970642, "advantages/var": 0.24598548295631772, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.021344717182497, "grad_norm": 0.26965248470222386, "learning_rate": 9.549150281252632e-08, "loss": 0.0, "num_tokens": 147411097.0, "reward": 0.69921875, "reward_std": 0.13071897625923157, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 941 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.1175744124147052e-09, "advantages/std": 0.5227828621864319, "advantages/var": 0.27330192099583783, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 4.025613660618997, "grad_norm": 0.21569709619664004, "learning_rate": 9.470717009029888e-08, "loss": -0.0, "num_tokens": 147571275.0, "reward": 0.66796875, "reward_std": 0.13033825159072876, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 942 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2738897237632814e-09, "advantages/std": 0.5483142733573914, "advantages/var": 0.3006485423674441, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.029882604055496, "grad_norm": 0.30706035259478337, "learning_rate": 9.39257346627857e-08, "loss": 0.0, "num_tokens": 147717314.0, "reward": 0.71484375, "reward_std": 0.16017881035804749, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 943 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.344317613955936e-09, "advantages/std": 0.5227922201156616, "advantages/var": 0.2733117054134624, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.034151547491995, "grad_norm": 0.3296832962360881, "learning_rate": 9.314720211619165e-08, "loss": -0.0, "num_tokens": 147872218.0, "reward": 0.72265625, "reward_std": 0.14058800041675568, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 944 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814613434419682e-09, "advantages/std": 0.5227857232093811, "advantages/var": 0.27330491239155563, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.0384204909284955, "grad_norm": 0.3017122128114032, "learning_rate": 9.237157801596957e-08, "loss": 0.0, "num_tokens": 148004908.0, "reward": 0.765625, "reward_std": 0.13269484043121338, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 945 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5629543724332273e-09, "advantages/std": 0.5227810740470886, "advantages/var": 0.27330005138182756, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.042689434364995, "grad_norm": 0.3017288715106481, "learning_rate": 9.159886790678123e-08, "loss": 0.0, "num_tokens": 148171117.0, "reward": 0.640625, "reward_std": 0.12756995856761932, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 946 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131207417015343e-10, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.046958377801494, "grad_norm": 0.24636777783457547, "learning_rate": 9.082907731245731e-08, "loss": -0.0, "num_tokens": 148333037.0, "reward": 0.7109375, "reward_std": 0.16097763180732727, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 947 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.492724637343774e-09, "advantages/std": 0.5483061075210571, "advantages/var": 0.30063958754489306, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.0512273212379935, "grad_norm": 0.2643422583957601, "learning_rate": 9.00622117359574e-08, "loss": 0.0, "num_tokens": 148491408.0, "reward": 0.68359375, "reward_std": 0.14940111339092255, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 948 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 4.408867390103578e-09, "advantages/std": 0.3696673810482025, "advantages/var": 0.13665397261103696, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 4.055496264674493, "grad_norm": 0.24213778856531787, "learning_rate": 8.929827665933209e-08, "loss": -0.0, "num_tokens": 148643434.0, "reward": 0.6640625, "reward_std": 0.0677327960729599, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 949 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.453571100295136e-10, "advantages/std": 0.5227953791618347, "advantages/var": 0.2733150084729665, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.059765208110993, "grad_norm": 0.28065996813370997, "learning_rate": 8.85372775436819e-08, "loss": 0.0, "num_tokens": 148796153.0, "reward": 0.76171875, "reward_std": 0.1434749811887741, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 950 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.966894768009674e-09, "advantages/std": 0.46759626269340515, "advantages/var": 0.21864626488483996, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.064034151547492, "grad_norm": 0.29909557937352554, "learning_rate": 8.777921982911996e-08, "loss": 0.0, "num_tokens": 148926152.0, "reward": 0.79296875, "reward_std": 0.10941822826862335, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 951 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 9.19955663384395e-09, "advantages/std": 0.4049423635005951, "advantages/var": 0.1639783177574481, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 4.068303094983992, "grad_norm": 0.22336800428426953, "learning_rate": 8.702410893473173e-08, "loss": -0.0, "num_tokens": 149081237.0, "reward": 0.66015625, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 952 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987508612632739e-09, "advantages/std": 0.4676083028316498, "advantages/var": 0.2186575248770959, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.072572038420491, "grad_norm": 0.26231833502338076, "learning_rate": 8.627195025853734e-08, "loss": -0.0, "num_tokens": 149230995.0, "reward": 0.75390625, "reward_std": 0.12020084261894226, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 953 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 4.25859719291336e-09, "advantages/std": 0.4373846650123596, "advantages/var": 0.19130534518797404, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 4.07684098185699, "grad_norm": 0.24172333880642313, "learning_rate": 8.552274917745244e-08, "loss": 0.0, "num_tokens": 149399147.0, "reward": 0.60546875, "reward_std": 0.08417458832263947, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 954 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5624051357954486e-09, "advantages/std": 0.5960826277732849, "advantages/var": 0.35531449913310453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.0811099252934895, "grad_norm": 0.3066328634254312, "learning_rate": 8.477651104724992e-08, "loss": -0.0, "num_tokens": 149565867.0, "reward": 0.6484375, "reward_std": 0.1902901977300644, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 955 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5056154153628903e-09, "advantages/std": 0.6185660362243652, "advantages/var": 0.3826239411703227, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.08537886872999, "grad_norm": 0.31224449036267354, "learning_rate": 8.403324120252159e-08, "loss": -0.0, "num_tokens": 149723469.0, "reward": 0.67578125, "reward_std": 0.1836051195859909, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 956 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 1.0456236160976e-08, "advantages/std": 0.46761026978492737, "advantages/var": 0.21865936440833256, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.089647812166489, "grad_norm": 0.26791288036125843, "learning_rate": 8.32929449566398e-08, "loss": 0.0, "num_tokens": 149865420.0, "reward": 0.69921875, "reward_std": 0.1225549504160881, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 957 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.69179720757951e-09, "advantages/std": 0.57268887758255, "advantages/var": 0.327972550506761, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.093916755602988, "grad_norm": 0.26559537702277175, "learning_rate": 8.255562760172003e-08, "loss": 0.0, "num_tokens": 150021469.0, "reward": 0.77734375, "reward_std": 0.16674911975860596, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 958 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.077871021256398e-09, "advantages/std": 0.596076250076294, "advantages/var": 0.3553068959050165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.098185699039488, "grad_norm": 0.2941006947409124, "learning_rate": 8.182129440858259e-08, "loss": -0.0, "num_tokens": 150179899.0, "reward": 0.77734375, "reward_std": 0.18292498588562012, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 959 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.8895334850316917e-09, "advantages/std": 0.36966368556022644, "advantages/var": 0.13665124042196997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.102454642475987, "grad_norm": 0.2398030583187631, "learning_rate": 8.10899506267148e-08, "loss": 0.0, "num_tokens": 150311781.0, "reward": 0.83984375, "reward_std": 0.06549245119094849, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.36746934056282043, "step": 960 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628731285651194e-09, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 4.106723585912487, "grad_norm": 0.2716576868483362, "learning_rate": 8.036160148423449e-08, "loss": 0.0, "num_tokens": 150470368.0, "reward": 0.69921875, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 961 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.2597100125149844e-09, "advantages/std": 0.3696575164794922, "advantages/var": 0.13664667948978604, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.110992529348986, "grad_norm": 0.18986091291824578, "learning_rate": 7.963625218785097e-08, "loss": 0.0, "num_tokens": 150612347.0, "reward": 0.7109375, "reward_std": 0.06089799851179123, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 962 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.881998232812962e-09, "advantages/std": 0.6185729503631592, "advantages/var": 0.3826324949209834, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.115261472785486, "grad_norm": 0.3400034315594602, "learning_rate": 7.891390792282926e-08, "loss": 0.0, "num_tokens": 150759230.0, "reward": 0.7265625, "reward_std": 0.1921473890542984, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 963 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.8777980314856883e-09, "advantages/std": 0.49596524238586426, "advantages/var": 0.24598152165486908, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.119530416221985, "grad_norm": 0.33934023630760835, "learning_rate": 7.819457385295252e-08, "loss": 0.0, "num_tokens": 150910087.0, "reward": 0.62109375, "reward_std": 0.12677115201950073, "rewards/drgrpo_math_reward/mean": 0.62109375, "rewards/drgrpo_math_reward/std": 0.4860650300979614, "step": 964 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.3278015020627767e-09, "advantages/std": 0.701401948928833, "advantages/var": 0.49196469396116527, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.123799359658484, "grad_norm": 0.3693397260938956, "learning_rate": 7.747825512048461e-08, "loss": 0.0, "num_tokens": 151068328.0, "reward": 0.71484375, "reward_std": 0.25619441270828247, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 965 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.633321316168079e-09, "advantages/std": 0.4959716498851776, "advantages/var": 0.2459878774898252, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.128068303094984, "grad_norm": 0.3273626422965575, "learning_rate": 7.676495684613432e-08, "loss": 0.0, "num_tokens": 151198824.0, "reward": 0.83203125, "reward_std": 0.13242888450622559, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.3745708465576172, "step": 966 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.1292784298048236e-09, "advantages/std": 0.4373888075351715, "advantages/var": 0.1913089689570393, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 4.132337246531484, "grad_norm": 0.2688237297891008, "learning_rate": 7.6054684129018e-08, "loss": -0.0, "num_tokens": 151343984.0, "reward": 0.75, "reward_std": 0.0875919908285141, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 967 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.726110817091556e-09, "advantages/std": 0.4374036490917206, "advantages/var": 0.19132195223875303, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.136606189967983, "grad_norm": 0.25840927545905207, "learning_rate": 7.534744204662347e-08, "loss": 0.0, "num_tokens": 151484365.0, "reward": 0.765625, "reward_std": 0.10231749713420868, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 968 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 6.2985861164521495e-09, "advantages/std": 0.36965540051460266, "advantages/var": 0.1366451151296113, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.140875133404482, "grad_norm": 0.21189480775325778, "learning_rate": 7.464323565477371e-08, "loss": -0.0, "num_tokens": 151613008.0, "reward": 0.9140625, "reward_std": 0.0586601160466671, "rewards/drgrpo_math_reward/mean": 0.9140625, "rewards/drgrpo_math_reward/std": 0.28082075715065, "step": 969 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.2463817063291353e-10, "advantages/std": 0.5483036041259766, "advantages/var": 0.3006368422975356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.145144076840982, "grad_norm": 0.3253137810960524, "learning_rate": 7.394206998759011e-08, "loss": 0.0, "num_tokens": 151759662.0, "reward": 0.7265625, "reward_std": 0.14716076850891113, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 970 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.008279339781276e-09, "advantages/std": 0.5227868556976318, "advantages/var": 0.27330609649021653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.149413020277481, "grad_norm": 0.2837713526169662, "learning_rate": 7.324395005745771e-08, "loss": 0.0, "num_tokens": 151906750.0, "reward": 0.73046875, "reward_std": 0.13269728422164917, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 971 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.1939043697180337e-09, "advantages/std": 0.43739062547683716, "advantages/var": 0.19131055925501883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.15368196371398, "grad_norm": 0.22668461022016348, "learning_rate": 7.254888085498812e-08, "loss": -0.0, "num_tokens": 152053010.0, "reward": 0.7890625, "reward_std": 0.08982987701892853, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 972 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.749723319311711e-10, "advantages/std": 0.4049423336982727, "advantages/var": 0.16397829362100325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.1579509071504805, "grad_norm": 0.22761884549822234, "learning_rate": 7.185686734898477e-08, "loss": -0.0, "num_tokens": 152193920.0, "reward": 0.68359375, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 973 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.0082519201446085e-09, "advantages/std": 0.5227904319763184, "advantages/var": 0.27330983576598555, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.16221985058698, "grad_norm": 0.34005738823016896, "learning_rate": 7.116791448640663e-08, "loss": -0.0, "num_tokens": 152334407.0, "reward": 0.703125, "reward_std": 0.13781970739364624, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 974 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.43738046288490295, "advantages/var": 0.19130166931341197, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.166488794023479, "grad_norm": 0.25319669034295045, "learning_rate": 7.048202719233343e-08, "loss": -0.0, "num_tokens": 152475918.0, "reward": 0.7734375, "reward_std": 0.08075720071792603, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 975 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.453593949520833e-10, "advantages/std": 0.5227926969528198, "advantages/var": 0.2733122039872029, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.1707577374599785, "grad_norm": 0.23322850370466372, "learning_rate": 6.979921036993041e-08, "loss": 0.0, "num_tokens": 152627184.0, "reward": 0.7734375, "reward_std": 0.13952961564064026, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 976 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.8166796141722196e-09, "advantages/std": 0.4959683120250702, "advantages/var": 0.24598456653299738, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.175026680896478, "grad_norm": 0.23281228909620943, "learning_rate": 6.911946890041254e-08, "loss": -0.0, "num_tokens": 152793914.0, "reward": 0.58984375, "reward_std": 0.12954191863536835, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 977 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.1818942141274374e-09, "advantages/std": 0.6402619481086731, "advantages/var": 0.4099353621959132, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.179295624332978, "grad_norm": 0.3735471434566129, "learning_rate": 6.844280764301074e-08, "loss": -0.0, "num_tokens": 152958215.0, "reward": 0.703125, "reward_std": 0.1759803295135498, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 978 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4676026999950409, "advantages/var": 0.21865228504265222, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.183564567769477, "grad_norm": 0.33626281644396355, "learning_rate": 6.776923143493635e-08, "loss": 0.0, "num_tokens": 153131159.0, "reward": 0.5546875, "reward_std": 0.1145455539226532, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49797385931015015, "step": 979 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.8778015294086476e-09, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.187833511205977, "grad_norm": 0.2626485019509204, "learning_rate": 6.709874509134682e-08, "loss": 0.0, "num_tokens": 153283971.0, "reward": 0.66015625, "reward_std": 0.1255940943956375, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 980 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.298465769204987e-10, "advantages/std": 0.36966246366500854, "advantages/var": 0.13665033704288376, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 4.192102454642476, "grad_norm": 0.1579751828880399, "learning_rate": 6.643135340531136e-08, "loss": 0.0, "num_tokens": 153432473.0, "reward": 0.73828125, "reward_std": 0.06431539356708527, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 981 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1291849036688268e-09, "advantages/std": 0.6185806393623352, "advantages/var": 0.3826420073939154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.196371398078975, "grad_norm": 0.37827716848300164, "learning_rate": 6.576706114777625e-08, "loss": -0.0, "num_tokens": 153598469.0, "reward": 0.63671875, "reward_std": 0.20186671614646912, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 982 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954491476993248e-08, "advantages/std": 0.46759578585624695, "advantages/var": 0.21864581895052115, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.2006403415154745, "grad_norm": 0.22489880299470943, "learning_rate": 6.510587306753135e-08, "loss": -0.0, "num_tokens": 153753379.0, "reward": 0.7890625, "reward_std": 0.10718280076980591, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 983 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.3472301909481332e-09, "advantages/std": 0.49596890807151794, "advantages/var": 0.24598515777365382, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.204909284951975, "grad_norm": 0.2759049680659352, "learning_rate": 6.444779389117578e-08, "loss": 0.0, "num_tokens": 153906711.0, "reward": 0.78125, "reward_std": 0.13018855452537537, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 984 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.636393538163668e-10, "advantages/std": 0.6402789950370789, "advantages/var": 0.40995719148569165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.209178228388474, "grad_norm": 0.2965646840634463, "learning_rate": 6.379282832308414e-08, "loss": -0.0, "num_tokens": 154066904.0, "reward": 0.6484375, "reward_std": 0.19977852702140808, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.47839346528053284, "step": 985 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.7941619841441395e-09, "advantages/std": 0.5483075380325317, "advantages/var": 0.30064115626329624, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.213447171824973, "grad_norm": 0.3275903862159375, "learning_rate": 6.314098104537325e-08, "loss": -0.0, "num_tokens": 154224905.0, "reward": 0.72265625, "reward_std": 0.15163899958133698, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 986 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.2250465863671716e-09, "advantages/std": 0.4959651231765747, "advantages/var": 0.24598140340755492, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.217716115261473, "grad_norm": 0.2526583838758858, "learning_rate": 6.249225671786784e-08, "loss": 0.0, "num_tokens": 154379416.0, "reward": 0.67578125, "reward_std": 0.1250661313533783, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 987 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5055895915341618e-09, "advantages/std": 0.6185766458511353, "advantages/var": 0.3826370667924408, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 4.221985058697972, "grad_norm": 0.3235101551669753, "learning_rate": 6.184665997806831e-08, "loss": -0.0, "num_tokens": 154560711.0, "reward": 0.62890625, "reward_std": 0.19674183428287506, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.48404383659362793, "step": 988 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4937900080004843e-09, "advantages/std": 0.4675971269607544, "advantages/var": 0.21864707314195186, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.226254002134472, "grad_norm": 0.33109445872143445, "learning_rate": 6.120419544111655e-08, "loss": -0.0, "num_tokens": 154708701.0, "reward": 0.77734375, "reward_std": 0.10889026522636414, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 989 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.659081040337769e-09, "advantages/std": 0.5726781487464905, "advantages/var": 0.3279602620517075, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.230522945570971, "grad_norm": 0.24498457461398332, "learning_rate": 6.056486769976388e-08, "loss": -0.0, "num_tokens": 154874139.0, "reward": 0.64453125, "reward_std": 0.15308444201946259, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.4795927405357361, "step": 990 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.5969216095717883e-09, "advantages/std": 0.4373989999294281, "advantages/var": 0.19131788513926384, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.234791889007471, "grad_norm": 0.22464548027046374, "learning_rate": 5.992868132433753e-08, "loss": 0.0, "num_tokens": 155028752.0, "reward": 0.6953125, "reward_std": 0.0966646745800972, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 991 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.8777983699937923e-09, "advantages/std": 0.4959651529788971, "advantages/var": 0.2459814329693808, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.23906083244397, "grad_norm": 0.25152309433924797, "learning_rate": 5.929564086270833e-08, "loss": -0.0, "num_tokens": 155187937.0, "reward": 0.67578125, "reward_std": 0.1250661313533783, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 992 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.9060405706929244e-10, "advantages/std": 0.5960783958435059, "advantages/var": 0.35530945399136726, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.243329775880469, "grad_norm": 0.24942456745885597, "learning_rate": 5.8665750840258156e-08, "loss": 0.0, "num_tokens": 155347820.0, "reward": 0.6796875, "reward_std": 0.18675413727760315, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 993 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.899481121099718e-09, "advantages/std": 0.4049533009529114, "advantages/var": 0.16398717595265921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.247598719316969, "grad_norm": 0.21808756592922232, "learning_rate": 5.8039015759847207e-08, "loss": 0.0, "num_tokens": 155466166.0, "reward": 0.83203125, "reward_std": 0.08390620350837708, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.3745708465576172, "step": 994 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.907182821412798e-10, "advantages/std": 0.5227929949760437, "advantages/var": 0.27331251559602165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.251867662753469, "grad_norm": 0.2570254852272524, "learning_rate": 5.74154401017824e-08, "loss": -0.0, "num_tokens": 155618360.0, "reward": 0.72265625, "reward_std": 0.14006003737449646, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 995 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.3360734626754018e-09, "advantages/std": 0.5227945446968079, "advantages/var": 0.27331413596474263, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.256136606189968, "grad_norm": 0.27912900097061305, "learning_rate": 5.6795028323784964e-08, "loss": -0.0, "num_tokens": 155761125.0, "reward": 0.76953125, "reward_std": 0.1422979235649109, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 996 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.906087050636722e-10, "advantages/std": 0.5960713028907776, "advantages/var": 0.3553009981299091, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.2604055496264674, "grad_norm": 0.2666367592047003, "learning_rate": 5.61777848609587e-08, "loss": 0.0, "num_tokens": 155926364.0, "reward": 0.73046875, "reward_std": 0.17662307620048523, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 997 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.4536934733273465e-09, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.264674493062967, "grad_norm": 0.27490102092925023, "learning_rate": 5.5563714125758335e-08, "loss": 0.0, "num_tokens": 156075696.0, "reward": 0.7421875, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 998 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.125861622660058e-09, "advantages/std": 0.5227845311164856, "advantages/var": 0.2733036659746837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.268943436499466, "grad_norm": 0.27584388928748027, "learning_rate": 5.495282050795763e-08, "loss": 0.0, "num_tokens": 156227052.0, "reward": 0.74609375, "reward_std": 0.13098736107349396, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 999 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.095582713468022e-09, "advantages/std": 0.548311710357666, "advantages/var": 0.30064573171534903, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.273212379935966, "grad_norm": 0.302465854225609, "learning_rate": 5.434510837461853e-08, "loss": 0.0, "num_tokens": 156386422.0, "reward": 0.578125, "reward_std": 0.15623345971107483, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49482619762420654, "step": 1000 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.00831498586979e-09, "advantages/std": 0.5227822065353394, "advantages/var": 0.2733012354699582, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.277481323372466, "grad_norm": 0.3149036165515859, "learning_rate": 5.3740582070059435e-08, "loss": 0.0, "num_tokens": 156520181.0, "reward": 0.84765625, "reward_std": 0.12927743792533875, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.3600577116012573, "step": 1001 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.7555872617828865e-09, "advantages/std": 0.4959664046764374, "advantages/var": 0.24598267456767164, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.281750266808965, "grad_norm": 0.2360541009064038, "learning_rate": 5.313924591582453e-08, "loss": -0.0, "num_tokens": 156682366.0, "reward": 0.796875, "reward_std": 0.12677361071109772, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.40311288833618164, "step": 1002 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 8.464772819654116e-09, "advantages/std": 0.46759918332099915, "advantages/var": 0.21864899624246537, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.286019210245464, "grad_norm": 0.26427574271744264, "learning_rate": 5.2541104210653e-08, "loss": 0.0, "num_tokens": 156818097.0, "reward": 0.8359375, "reward_std": 0.11165857315063477, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.3710577189922333, "step": 1003 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.8778015294086476e-09, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 4.2902881536819635, "grad_norm": 0.21933021292288724, "learning_rate": 5.1946161230447485e-08, "loss": -0.0, "num_tokens": 156972474.0, "reward": 0.70703125, "reward_std": 0.1255940943956375, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 1004 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 6.324696116718118e-09, "advantages/std": 0.4049423038959503, "advantages/var": 0.16397826948456018, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.294557097118463, "grad_norm": 0.21391185657647827, "learning_rate": 5.135442122824452e-08, "loss": 0.0, "num_tokens": 157113545.0, "reward": 0.80859375, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 1005 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 7.468992883293068e-09, "advantages/std": 0.4675944447517395, "advantages/var": 0.21864456476268757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.298826040554963, "grad_norm": 0.1798798776554831, "learning_rate": 5.076588843418345e-08, "loss": 0.0, "num_tokens": 157269558.0, "reward": 0.81640625, "reward_std": 0.10547532141208649, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 1006 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.516801416758646e-09, "advantages/std": 0.6185721755027771, "advantages/var": 0.3826315363062385, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.303094983991462, "grad_norm": 0.3492523238461835, "learning_rate": 5.018056705547652e-08, "loss": 0.0, "num_tokens": 157423276.0, "reward": 0.70703125, "reward_std": 0.190556138753891, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 1007 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9916444828301443e-09, "advantages/std": 0.4676148593425751, "advantages/var": 0.21866365667797627, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.307363927427962, "grad_norm": 0.2531791458957888, "learning_rate": 4.9598461276378734e-08, "loss": -0.0, "num_tokens": 157568748.0, "reward": 0.765625, "reward_std": 0.12703317403793335, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 1008 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.022351402535043e-09, "advantages/std": 0.6185773611068726, "advantages/var": 0.3826379516739422, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.311632870864461, "grad_norm": 0.3824698656940314, "learning_rate": 4.9019575258157866e-08, "loss": 0.0, "num_tokens": 157717700.0, "reward": 0.60546875, "reward_std": 0.19621387124061584, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.48970720171928406, "step": 1009 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.193799697029419e-09, "advantages/std": 0.43740496039390564, "advantages/var": 0.19132309937719416, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.31590181430096, "grad_norm": 0.2268254209772028, "learning_rate": 4.844391313906482e-08, "loss": 0.0, "num_tokens": 157847100.0, "reward": 0.80859375, "reward_std": 0.10231995582580566, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 1010 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 4.02474204665107e-09, "advantages/std": 0.40494880080223083, "advantages/var": 0.16398353127116483, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.3201707577374595, "grad_norm": 0.18082226148243413, "learning_rate": 4.787147903430383e-08, "loss": 0.0, "num_tokens": 157980420.0, "reward": 0.8125, "reward_std": 0.08048880845308304, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.3910769522190094, "step": 1011 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 6.098430951616284e-09, "advantages/std": 0.5726816654205322, "advantages/var": 0.3279642899088344, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.32443970117396, "grad_norm": 0.28665244349419267, "learning_rate": 4.7302277036003534e-08, "loss": -0.0, "num_tokens": 158142110.0, "reward": 0.67578125, "reward_std": 0.15873728692531586, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 1012 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.816740376924563e-09, "advantages/std": 0.495957612991333, "advantages/var": 0.24597395388406085, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.328708644610459, "grad_norm": 0.2654031864670325, "learning_rate": 4.673631121318672e-08, "loss": -0.0, "num_tokens": 158292692.0, "reward": 0.70703125, "reward_std": 0.11982014775276184, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 1013 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5055969904057808e-09, "advantages/std": 0.6185736060142517, "advantages/var": 0.3826333060574747, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.332977588046958, "grad_norm": 0.36422258318792977, "learning_rate": 4.617358561174278e-08, "loss": -0.0, "num_tokens": 158452300.0, "reward": 0.6796875, "reward_std": 0.19161942601203918, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 1014 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.987587949498236e-09, "advantages/std": 0.23379793763160706, "advantages/var": 0.05466147564079282, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.337246531483458, "grad_norm": 0.08889800429976145, "learning_rate": 4.561410425439743e-08, "loss": -0.0, "num_tokens": 158586944.0, "reward": 0.8203125, "reward_std": 0.027221955358982086, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 1015 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.299886958035194e-09, "advantages/std": 0.40494275093078613, "advantages/var": 0.1639786315313927, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.341515474919957, "grad_norm": 0.17475737149587717, "learning_rate": 4.5057871140684325e-08, "loss": -0.0, "num_tokens": 158734807.0, "reward": 0.6953125, "reward_std": 0.07536394149065018, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 1016 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.520274014732149e-09, "advantages/std": 0.5483058094978333, "advantages/var": 0.3006392607290742, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.345784418356457, "grad_norm": 0.3731423923229412, "learning_rate": 4.450489024691689e-08, "loss": 0.0, "num_tokens": 158886692.0, "reward": 0.6875, "reward_std": 0.14887069165706635, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 1017 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.298465769204987e-10, "advantages/std": 0.36966246366500854, "advantages/var": 0.13665033704288376, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.350053361792956, "grad_norm": 0.22982263969851446, "learning_rate": 4.39551655261593e-08, "loss": 0.0, "num_tokens": 159030327.0, "reward": 0.66796875, "reward_std": 0.06431539356708527, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 1018 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 9.76531722867246e-09, "advantages/std": 0.5960652232170105, "advantages/var": 0.35529375032874455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.354322305229456, "grad_norm": 0.3434728947464965, "learning_rate": 4.340870090819865e-08, "loss": -0.0, "num_tokens": 159188666.0, "reward": 0.73828125, "reward_std": 0.16978827118873596, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 1019 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.226825409861377e-09, "advantages/std": 0.522786021232605, "advantages/var": 0.2733052239962177, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 4.358591248665955, "grad_norm": 0.3046441133239695, "learning_rate": 4.286550029951674e-08, "loss": 0.0, "num_tokens": 159355247.0, "reward": 0.69140625, "reward_std": 0.1332252472639084, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 1020 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.323144933438982e-10, "advantages/std": 0.4373930096626282, "advantages/var": 0.19131264490173194, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.362860192102454, "grad_norm": 0.21511357782167576, "learning_rate": 4.232556758326211e-08, "loss": 0.0, "num_tokens": 159497734.0, "reward": 0.87109375, "reward_std": 0.09100939333438873, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33575257658958435, "step": 1021 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4083371836861376e-09, "advantages/std": 0.4959692358970642, "advantages/var": 0.24598548295631772, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.3671291355389545, "grad_norm": 0.25509993985117824, "learning_rate": 4.178890661922241e-08, "loss": 0.0, "num_tokens": 159660252.0, "reward": 0.63671875, "reward_std": 0.13071897625923157, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 1022 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.9876135291586783e-09, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 4.371398078975454, "grad_norm": 0.2952935455145491, "learning_rate": 4.125552124379628e-08, "loss": -0.0, "num_tokens": 159797853.0, "reward": 0.86328125, "reward_std": 0.10376540571451187, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.34422317147254944, "step": 1023 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.481332608054646e-09, "advantages/std": 0.4676010310649872, "advantages/var": 0.2186507242530391, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.375667022411953, "grad_norm": 0.24696306722713282, "learning_rate": 4.072541526996681e-08, "loss": 0.0, "num_tokens": 159956444.0, "reward": 0.65625, "reward_std": 0.11230766773223877, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 1024 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 7.64347128426064e-09, "advantages/std": 0.5483047366142273, "advantages/var": 0.30063808419359717, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 4.3799359658484525, "grad_norm": 0.2722365045836188, "learning_rate": 4.019859248727342e-08, "loss": 0.0, "num_tokens": 160136653.0, "reward": 0.61328125, "reward_std": 0.14886824786663055, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 1025 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.821674987396077e-09, "advantages/std": 0.5483134388923645, "advantages/var": 0.30064762726997074, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.384204909284952, "grad_norm": 0.29008508597364296, "learning_rate": 3.967505666178555e-08, "loss": -0.0, "num_tokens": 160293294.0, "reward": 0.65234375, "reward_std": 0.15900175273418427, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 1026 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2738974786069975e-09, "advantages/std": 0.5483109354972839, "advantages/var": 0.30064488198590666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.388473852721451, "grad_norm": 0.3482378111702543, "learning_rate": 3.915481153607525e-08, "loss": 0.0, "num_tokens": 160445614.0, "reward": 0.6875, "reward_std": 0.15676140785217285, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 1027 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.794179709875018e-09, "advantages/std": 0.5483061075210571, "advantages/var": 0.30063958754489306, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.392742796157951, "grad_norm": 0.25458533017295193, "learning_rate": 3.8637860829190185e-08, "loss": 0.0, "num_tokens": 160605812.0, "reward": 0.76171875, "reward_std": 0.14940111339092255, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 1028 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.81670043231303e-09, "advantages/std": 0.4959646463394165, "advantages/var": 0.2459809304185825, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.397011739594451, "grad_norm": 0.2933507838338875, "learning_rate": 3.812420823662782e-08, "loss": 0.0, "num_tokens": 160746801.0, "reward": 0.7421875, "reward_std": 0.1261245161294937, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 1029 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.9833938148533065e-09, "advantages/std": 0.46760255098342896, "advantages/var": 0.21865214568621028, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.40128068303095, "grad_norm": 0.30265927689089706, "learning_rate": 3.76138574303082e-08, "loss": 0.0, "num_tokens": 160887984.0, "reward": 0.7890625, "reward_std": 0.11613436043262482, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 1030 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 5.174635975246255e-09, "advantages/std": 0.4049513339996338, "advantages/var": 0.16398558290808296, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.405549626467449, "grad_norm": 0.20903610857543548, "learning_rate": 3.7106812058548375e-08, "loss": 0.0, "num_tokens": 161046011.0, "reward": 0.69140625, "reward_std": 0.08166831731796265, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 1031 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.8786649560713735e-09, "advantages/std": 0.286345511674881, "advantages/var": 0.0819937520563494, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.4098185699039485, "grad_norm": 0.1688138649146096, "learning_rate": 3.660307574603588e-08, "loss": 0.0, "num_tokens": 161173076.0, "reward": 0.80859375, "reward_std": 0.04221830889582634, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 1032 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.481491415916831e-09, "advantages/std": 0.4675844609737396, "advantages/var": 0.21863522814410263, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.414087513340448, "grad_norm": 0.2795226378853111, "learning_rate": 3.6102652093802974e-08, "loss": -0.0, "num_tokens": 161313095.0, "reward": 0.71875, "reward_std": 0.0974610298871994, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1033 }, { "advantages/mean": -4.6566128730773926e-09, "advantages/snr": 8.4927717217003e-09, "advantages/std": 0.5483030676841736, "advantages/var": 0.30063625403187544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.418356456776948, "grad_norm": 0.28811601274226684, "learning_rate": 3.560554467920096e-08, "loss": 0.0, "num_tokens": 161456325.0, "reward": 0.80078125, "reward_std": 0.1465141326189041, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 1034 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.065668450848788e-09, "advantages/std": 0.5726749300956726, "advantages/var": 0.3279565755600835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.422625400213447, "grad_norm": 0.24685666805511652, "learning_rate": 3.5111757055874326e-08, "loss": 0.0, "num_tokens": 161608634.0, "reward": 0.7265625, "reward_std": 0.14966705441474915, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 1035 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.218797108153148e-09, "advantages/std": 0.5483075380325317, "advantages/var": 0.30064115626329624, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 4.426894343649947, "grad_norm": 0.3321275267280128, "learning_rate": 3.4621292753735765e-08, "loss": 0.0, "num_tokens": 161769820.0, "reward": 0.68359375, "reward_std": 0.15163899958133698, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 1036 }, { "advantages/mean": -5.122274160385132e-09, "advantages/snr": 1.0954451680481181e-08, "advantages/std": 0.46759748458862305, "advantages/var": 0.21864740759360757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.431163287086446, "grad_norm": 0.3842088085646925, "learning_rate": 3.413415527894059e-08, "loss": 0.0, "num_tokens": 161910800.0, "reward": 0.84375, "reward_std": 0.10942068696022034, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.3638034462928772, "step": 1037 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 5.3229534351711075e-09, "advantages/std": 0.4374087452888489, "advantages/var": 0.19132641045516507, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 4.435432230522945, "grad_norm": 0.21771886571873586, "learning_rate": 3.365034811386186e-08, "loss": 0.0, "num_tokens": 162069764.0, "reward": 0.68359375, "reward_std": 0.1052069365978241, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.4659844934940338, "step": 1038 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4083616409231023e-09, "advantages/std": 0.49596062302589417, "advantages/var": 0.2459769395922331, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.439701173959445, "grad_norm": 0.28642282646933764, "learning_rate": 3.316987471706556e-08, "loss": -0.0, "num_tokens": 162225218.0, "reward": 0.71875, "reward_std": 0.12217670679092407, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1039 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.97115066346681e-09, "advantages/std": 0.4675883650779724, "advantages/var": 0.2186388791562912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.443970117395945, "grad_norm": 0.25478303447053396, "learning_rate": 3.269273852328547e-08, "loss": -0.0, "num_tokens": 162372123.0, "reward": 0.76953125, "reward_std": 0.10087841749191284, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1040 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.3970688057076e-09, "advantages/std": 0.5483095049858093, "advantages/var": 0.30064331325778326, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 4.448239060832444, "grad_norm": 0.30279611286008074, "learning_rate": 3.2218942943399105e-08, "loss": 0.0, "num_tokens": 162533746.0, "reward": 0.7109375, "reward_std": 0.15452352166175842, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 1041 }, { "advantages/mean": 3.958120942115784e-09, "advantages/snr": 6.640369898388981e-09, "advantages/std": 0.5960693359375, "advantages/var": 0.35529865324497223, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.452508004268943, "grad_norm": 0.2731987958255392, "learning_rate": 3.174849136440294e-08, "loss": -0.0, "num_tokens": 162687667.0, "reward": 0.77734375, "reward_std": 0.17491313815116882, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41684433817863464, "step": 1042 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.98068239603254e-09, "advantages/std": 0.4959627091884613, "advantages/var": 0.24597900890555824, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.456776947705443, "grad_norm": 0.2104365270449523, "learning_rate": 3.128138714938855e-08, "loss": 0.0, "num_tokens": 162834651.0, "reward": 0.76953125, "reward_std": 0.12335620820522308, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1043 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.439373903985093e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.461045891141943, "grad_norm": 0.3192273891677802, "learning_rate": 3.081763363751844e-08, "loss": 0.0, "num_tokens": 162987603.0, "reward": 0.703125, "reward_std": 0.15650182962417603, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 1044 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.234946226282646e-09, "advantages/std": 0.5227998495101929, "advantages/var": 0.2733196826478803, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.465314834578442, "grad_norm": 0.2526000182994563, "learning_rate": 3.035723414400176e-08, "loss": -0.0, "num_tokens": 163145350.0, "reward": 0.74609375, "reward_std": 0.14848363399505615, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 1045 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.6722020691798937e-09, "advantages/std": 0.5227837562561035, "advantages/var": 0.27330285580524105, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.4695837780149414, "grad_norm": 0.32390884070268056, "learning_rate": 2.990019196007154e-08, "loss": 0.0, "num_tokens": 163305062.0, "reward": 0.69140625, "reward_std": 0.13151532411575317, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 1046 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 7.041755735282429e-09, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 4.473852721451441, "grad_norm": 0.26562549241892064, "learning_rate": 2.9446510352959918e-08, "loss": 0.0, "num_tokens": 163472473.0, "reward": 0.63671875, "reward_std": 0.1255941092967987, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.48188701272010803, "step": 1047 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.749659422971484e-10, "advantages/std": 0.40494683384895325, "advantages/var": 0.16398193824429175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.47812166488794, "grad_norm": 0.29683929045193824, "learning_rate": 2.8996192565876042e-08, "loss": -0.0, "num_tokens": 163597754.0, "reward": 0.7578125, "reward_std": 0.07825092226266861, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 1048 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.0655749347956924e-10, "advantages/std": 0.572688102722168, "advantages/var": 0.3279716629995164, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.482390608324439, "grad_norm": 0.3116239712728128, "learning_rate": 2.8549241817982017e-08, "loss": -0.0, "num_tokens": 163745480.0, "reward": 0.70703125, "reward_std": 0.16557207703590393, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 1049 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.749631495215814e-10, "advantages/std": 0.40494880080223083, "advantages/var": 0.16398353127116483, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.48665955176094, "grad_norm": 0.19240322344362223, "learning_rate": 2.8105661304370253e-08, "loss": -0.0, "num_tokens": 163889341.0, "reward": 0.828125, "reward_std": 0.08048880845308304, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 1050 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 6.920088884979619e-09, "advantages/std": 0.4373929798603058, "advantages/var": 0.19131261883107786, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.490928495197439, "grad_norm": 0.2267817838396342, "learning_rate": 2.766545419604066e-08, "loss": -0.0, "num_tokens": 164040469.0, "reward": 0.78515625, "reward_std": 0.09100939333438873, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 1051 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.129230843823993e-09, "advantages/std": 0.4373985826969147, "advantages/var": 0.1913175201452697, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.495197438633938, "grad_norm": 0.2770427223945021, "learning_rate": 2.722862363987749e-08, "loss": 0.0, "num_tokens": 164193848.0, "reward": 0.51171875, "reward_std": 0.096134252846241, "rewards/drgrpo_math_reward/mean": 0.51171875, "rewards/drgrpo_math_reward/std": 0.5008418560028076, "step": 1052 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.680390924281249e-09, "advantages/std": 0.5227926969528198, "advantages/var": 0.2733122039872029, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.4994663820704375, "grad_norm": 0.2707090909721515, "learning_rate": 2.6795172758627584e-08, "loss": 0.0, "num_tokens": 164344284.0, "reward": 0.78125, "reward_std": 0.13952963054180145, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 1053 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.347276877087511e-09, "advantages/std": 0.4959590435028076, "advantages/var": 0.24597537283221982, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.503735325506937, "grad_norm": 0.21248376510657957, "learning_rate": 2.636510465087771e-08, "loss": 0.0, "num_tokens": 164496584.0, "reward": 0.640625, "reward_std": 0.11993881314992905, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 1054 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.259648064183464e-09, "advantages/std": 0.3696756958961487, "advantages/var": 0.1366601201363018, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 4.508004268943436, "grad_norm": 0.1704470192455297, "learning_rate": 2.5938422391032055e-08, "loss": 0.0, "num_tokens": 164636662.0, "reward": 0.79296875, "reward_std": 0.07456512749195099, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 1055 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.680342941005758e-09, "advantages/std": 0.5227964520454407, "advantages/var": 0.27331613027130075, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.512273212379936, "grad_norm": 0.29609821089912336, "learning_rate": 2.5515129029290984e-08, "loss": 0.0, "num_tokens": 164779163.0, "reward": 0.71875, "reward_std": 0.1434774398803711, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1056 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.397123829683448e-09, "advantages/std": 0.5483006238937378, "advantages/var": 0.3006335741622621, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.516542155816436, "grad_norm": 0.3321857749921581, "learning_rate": 2.5095227591628466e-08, "loss": -0.0, "num_tokens": 164935062.0, "reward": 0.78125, "reward_std": 0.1442737877368927, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 1057 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.4856073226994137e-09, "advantages/std": 0.4675840735435486, "advantages/var": 0.21863486583157865, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.520811099252935, "grad_norm": 0.2390333733297495, "learning_rate": 2.467872107977098e-08, "loss": 0.0, "num_tokens": 165097134.0, "reward": 0.61328125, "reward_std": 0.0969306156039238, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.4879522919654846, "step": 1058 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.633372091520215e-09, "advantages/std": 0.49596717953681946, "advantages/var": 0.2459834431777077, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.525080042689434, "grad_norm": 0.2169048732862071, "learning_rate": 2.4265612471176032e-08, "loss": -0.0, "num_tokens": 165250060.0, "reward": 0.7734375, "reward_std": 0.12953945994377136, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 1059 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 6.29855717175748e-09, "advantages/std": 0.36965709924697876, "advantages/var": 0.1366463710236907, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.5293489861259335, "grad_norm": 0.23679888686417735, "learning_rate": 2.3855904719010443e-08, "loss": 0.0, "num_tokens": 165392113.0, "reward": 0.76171875, "reward_std": 0.060367584228515625, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 1060 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.0656379836078813e-10, "advantages/std": 0.5726792216300964, "advantages/var": 0.3279614908868531, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.533617929562434, "grad_norm": 0.23892417050547263, "learning_rate": 2.3449600752129596e-08, "loss": 0.0, "num_tokens": 165549716.0, "reward": 0.734375, "reward_std": 0.154791921377182, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.4425306022167206, "step": 1061 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.0191165426936045e-08, "advantages/std": 0.548311710357666, "advantages/var": 0.30064573171534903, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.537886872998933, "grad_norm": 0.28337521087616024, "learning_rate": 2.304670347505655e-08, "loss": 0.0, "num_tokens": 165699524.0, "reward": 0.7578125, "reward_std": 0.15623344480991364, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 1062 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262474074661002e-09, "advantages/std": 0.5726819634437561, "advantages/var": 0.3279646312537956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.542155816435432, "grad_norm": 0.2737285430712695, "learning_rate": 2.264721576796108e-08, "loss": -0.0, "num_tokens": 165850014.0, "reward": 0.8046875, "reward_std": 0.15926769375801086, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.39721766114234924, "step": 1063 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.680435099969692e-09, "advantages/std": 0.5227892398834229, "advantages/var": 0.27330858933788704, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.546424759871932, "grad_norm": 0.34394186222827744, "learning_rate": 2.2251140486639063e-08, "loss": 0.0, "num_tokens": 166011272.0, "reward": 0.58984375, "reward_std": 0.13611222803592682, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49282538890838623, "step": 1064 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4937803921789752e-09, "advantages/std": 0.46760013699531555, "advantages/var": 0.21864988811803787, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.550693703308431, "grad_norm": 0.24316829786415944, "learning_rate": 2.1858480462492278e-08, "loss": -0.0, "num_tokens": 166167939.0, "reward": 0.6796875, "reward_std": 0.11283563077449799, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 1065 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 6.298247429400683e-09, "advantages/std": 0.36967527866363525, "advantages/var": 0.13665981165503638, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 4.554962646744931, "grad_norm": 0.2219964251372802, "learning_rate": 2.1469238502507926e-08, "loss": 0.0, "num_tokens": 166314808.0, "reward": 0.8203125, "reward_std": 0.07403472065925598, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38467901945114136, "step": 1066 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.694562781299176e-10, "advantages/std": 0.4959580898284912, "advantages/var": 0.24597442686632576, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 4.55923159018143, "grad_norm": 0.3653896044998966, "learning_rate": 2.1083417389238855e-08, "loss": 0.0, "num_tokens": 166455155.0, "reward": 0.6640625, "reward_std": 0.11876175552606583, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 1067 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.439362225104603e-09, "advantages/std": 0.5726840496063232, "advantages/var": 0.3279670206734977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 4.56350053361793, "grad_norm": 0.269576721593581, "learning_rate": 2.070101988078332e-08, "loss": -0.0, "num_tokens": 166620743.0, "reward": 0.640625, "reward_std": 0.16097761690616608, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.4807571768760681, "step": 1068 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.038778708786431e-09, "advantages/std": 0.36966201663017273, "advantages/var": 0.1366500065390861, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.567769477054429, "grad_norm": 0.147475285462347, "learning_rate": 2.0322048710765483e-08, "loss": -0.0, "num_tokens": 166783154.0, "reward": 0.6796875, "reward_std": 0.06378498673439026, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 1069 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4373958706855774, "advantages/var": 0.19131514769279434, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.572038420490928, "grad_norm": 0.2517716303588861, "learning_rate": 1.9946506588315814e-08, "loss": 0.0, "num_tokens": 166925860.0, "reward": 0.75390625, "reward_std": 0.09442433714866638, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 1070 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.131242115120482e-10, "advantages/std": 0.5726816058158875, "advantages/var": 0.3279642216398635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.576307363927428, "grad_norm": 0.3634852172155903, "learning_rate": 1.9574396198051958e-08, "loss": 0.0, "num_tokens": 167073133.0, "reward": 0.76953125, "reward_std": 0.15703225135803223, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1071 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.2997950517032087e-09, "advantages/std": 0.40495893359184265, "advantages/var": 0.16399173789584243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.580576307363927, "grad_norm": 0.24460457844348826, "learning_rate": 1.920572020005884e-08, "loss": -0.0, "num_tokens": 167213513.0, "reward": 0.7421875, "reward_std": 0.08850065618753433, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.4382871091365814, "step": 1072 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.1639892700339495e-09, "advantages/std": 0.49596095085144043, "advantages/var": 0.2459772647694649, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.584845250800427, "grad_norm": 0.25322361819317524, "learning_rate": 1.8840481229870643e-08, "loss": -0.0, "num_tokens": 167346389.0, "reward": 0.79296875, "reward_std": 0.12270711362361908, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 1073 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.755668279409157e-09, "advantages/std": 0.4959557056427002, "advantages/var": 0.24597206195954868, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.5891141942369265, "grad_norm": 0.25045970534297207, "learning_rate": 1.84786818984512e-08, "loss": 0.0, "num_tokens": 167490353.0, "reward": 0.7734375, "reward_std": 0.11705183982849121, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 1074 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.008270656855743e-09, "advantages/std": 0.5227879881858826, "advantages/var": 0.2733072805914425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 4.593383137673426, "grad_norm": 0.2542750259013028, "learning_rate": 1.8120324792175567e-08, "loss": 0.0, "num_tokens": 167654451.0, "reward": 0.6796875, "reward_std": 0.1344047635793686, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 1075 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.3360743766432835e-09, "advantages/std": 0.5227941870689392, "advantages/var": 0.273313762033073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.597652081109925, "grad_norm": 0.2591197084066371, "learning_rate": 1.776541247281177e-08, "loss": 0.0, "num_tokens": 167808715.0, "reward": 0.6953125, "reward_std": 0.1417675018310547, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 1076 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 8.450023102116827e-09, "advantages/std": 0.4959692358970642, "advantages/var": 0.24598548295631772, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.601921024546424, "grad_norm": 0.2634345872404297, "learning_rate": 1.7413947477501913e-08, "loss": 0.0, "num_tokens": 167956325.0, "reward": 0.71484375, "reward_std": 0.13071896135807037, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.4523732364177704, "step": 1077 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.6946234331898e-10, "advantages/std": 0.49595168232917786, "advantages/var": 0.24596807120514175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.606189967982925, "grad_norm": 0.30768554301495155, "learning_rate": 1.7065932318744702e-08, "loss": -0.0, "num_tokens": 168099570.0, "reward": 0.75, "reward_std": 0.11310402303934097, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.4338609278202057, "step": 1078 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5624132598511334e-09, "advantages/std": 0.5960795283317566, "advantages/var": 0.3553108040962094, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.610458911419424, "grad_norm": 0.3892376803698612, "learning_rate": 1.6721369484377078e-08, "loss": -0.0, "num_tokens": 168248423.0, "reward": 0.69921875, "reward_std": 0.18687279522418976, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 1079 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 3.872947870535686e-09, "advantages/std": 0.6612887978553772, "advantages/var": 0.4373028741690099, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.614727854855923, "grad_norm": 0.3490531845018821, "learning_rate": 1.6380261437556662e-08, "loss": 0.0, "num_tokens": 168418683.0, "reward": 0.6875, "reward_std": 0.2284420132637024, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 1080 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.0646189037466052e-09, "advantages/std": 0.43739715218544006, "advantages/var": 0.19131626873993302, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.6189967982924225, "grad_norm": 0.253860150661586, "learning_rate": 1.604261061674378e-08, "loss": 0.0, "num_tokens": 168557074.0, "reward": 0.78125, "reward_std": 0.09442678093910217, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 1081 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4083580865314121e-09, "advantages/std": 0.49596187472343445, "advantages/var": 0.24597818117918369, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.623265741728922, "grad_norm": 0.255409443386475, "learning_rate": 1.570841943568446e-08, "loss": 0.0, "num_tokens": 168708907.0, "reward": 0.73828125, "reward_std": 0.1238841712474823, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.4404313564300537, "step": 1082 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175609054645574e-09, "advantages/std": 0.5227851271629333, "advantages/var": 0.2733042891827644, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.627534685165422, "grad_norm": 0.2720860789972887, "learning_rate": 1.5377690283392975e-08, "loss": 0.0, "num_tokens": 168841842.0, "reward": 0.78515625, "reward_std": 0.13204818964004517, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4115184545516968, "step": 1083 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.987646852646199e-09, "advantages/std": 0.4675866663455963, "advantages/var": 0.218637290544188, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.631803628601921, "grad_norm": 0.23422061530175992, "learning_rate": 1.505042552413466e-08, "loss": -0.0, "num_tokens": 168987902.0, "reward": 0.76171875, "reward_std": 0.09864053130149841, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 1084 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 6.181901817442994e-09, "advantages/std": 0.6402755975723267, "advantages/var": 0.4099528408466, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.636072572038421, "grad_norm": 0.3098506110740803, "learning_rate": 1.4726627497409272e-08, "loss": 0.0, "num_tokens": 169149174.0, "reward": 0.69140625, "reward_std": 0.19530031085014343, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 1085 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5483114123344421, "advantages/var": 0.3006454048961906, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.64034151547492, "grad_norm": 0.29092441908057226, "learning_rate": 1.4406298517934067e-08, "loss": 0.0, "num_tokens": 169294764.0, "reward": 0.74609375, "reward_std": 0.15570303797721863, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.4360972046852112, "step": 1086 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 6.387665982680258e-09, "advantages/std": 0.4374004006385803, "advantages/var": 0.19131911047879058, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.644610458911419, "grad_norm": 0.3130123884635255, "learning_rate": 1.4089440875627356e-08, "loss": 0.0, "num_tokens": 169431889.0, "reward": 0.76953125, "reward_std": 0.09837214648723602, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1087 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.481457711149072e-09, "advantages/std": 0.46758797764778137, "advantages/var": 0.2186385168407421, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.6488794023479185, "grad_norm": 0.2311164862473825, "learning_rate": 1.3776056835592131e-08, "loss": -0.0, "num_tokens": 169598500.0, "reward": 0.5703125, "reward_std": 0.10034800320863724, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4960011839866638, "step": 1088 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.323110477241685e-09, "advantages/std": 0.437395840883255, "advantages/var": 0.19131512162196973, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.653148345784419, "grad_norm": 0.22020508120163657, "learning_rate": 1.3466148638099528e-08, "loss": -0.0, "num_tokens": 169749373.0, "reward": 0.84765625, "reward_std": 0.09442433714866638, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.3600577116012573, "step": 1089 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.285228162442336e-09, "advantages/std": 0.5726901888847351, "advantages/var": 0.3279740524448336, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.657417289220918, "grad_norm": 0.2975456107825843, "learning_rate": 1.3159718498573558e-08, "loss": -0.0, "num_tokens": 169893473.0, "reward": 0.76953125, "reward_std": 0.16728198528289795, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1090 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.562928780215032e-09, "advantages/std": 0.5227848291397095, "advantages/var": 0.2733039775786352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.661686232657417, "grad_norm": 0.2822154330392377, "learning_rate": 1.2856768607574564e-08, "loss": 0.0, "num_tokens": 170042017.0, "reward": 0.765625, "reward_std": 0.13151776790618896, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42443734407424927, "step": 1091 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0646287690683442e-09, "advantages/std": 0.43739309906959534, "advantages/var": 0.19131272311370484, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.665955176093917, "grad_norm": 0.3146296229451411, "learning_rate": 1.2557301130783849e-08, "loss": -0.0, "num_tokens": 170191746.0, "reward": 0.69140625, "reward_std": 0.09271440654993057, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 1092 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.5455193155458818e-09, "advantages/std": 0.6402679681777954, "advantages/var": 0.40994307107452244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.670224119530416, "grad_norm": 0.3272099063255255, "learning_rate": 1.2261318208988292e-08, "loss": 0.0, "num_tokens": 170340786.0, "reward": 0.75390625, "reward_std": 0.18505056202411652, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.43157756328582764, "step": 1093 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.218884999076207e-09, "advantages/std": 0.5483008623123169, "advantages/var": 0.3006338356124303, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.674493062966915, "grad_norm": 0.34863042945311856, "learning_rate": 1.1968821958064701e-08, "loss": 0.0, "num_tokens": 170489979.0, "reward": 0.76953125, "reward_std": 0.1448042094707489, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1094 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4084047181915234e-09, "advantages/std": 0.49594545364379883, "advantages/var": 0.2459618929899534, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.678762006403415, "grad_norm": 0.2538455050726734, "learning_rate": 1.167981446896521e-08, "loss": -0.0, "num_tokens": 170656817.0, "reward": 0.66796875, "reward_std": 0.10627168416976929, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 1095 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.958592436801074e-10, "advantages/std": 0.46759748458862305, "advantages/var": 0.21864740759360757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.683030949839915, "grad_norm": 0.28690968099611613, "learning_rate": 1.1394297807701736e-08, "loss": 0.0, "num_tokens": 170786325.0, "reward": 0.7890625, "reward_std": 0.10942068696022034, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4087733030319214, "step": 1096 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.273931268670851e-09, "advantages/std": 0.5482963919639587, "advantages/var": 0.3006289334406951, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.687299893276414, "grad_norm": 0.32286900459011325, "learning_rate": 1.1112274015331657e-08, "loss": 0.0, "num_tokens": 170947675.0, "reward": 0.76171875, "reward_std": 0.13967934250831604, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 1097 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.511192125942753e-09, "advantages/std": 0.49596524238586426, "advantages/var": 0.24598152165486908, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.691568836712913, "grad_norm": 0.2753894199046987, "learning_rate": 1.083374510794305e-08, "loss": 0.0, "num_tokens": 171090866.0, "reward": 0.65234375, "reward_std": 0.12677115201950073, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 1098 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6262492693233955e-09, "advantages/std": 0.5726813077926636, "advantages/var": 0.3279638802951155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 4.695837780149413, "grad_norm": 0.3435572525051184, "learning_rate": 1.0558713076640413e-08, "loss": 0.0, "num_tokens": 171236588.0, "reward": 0.78125, "reward_std": 0.15650184452533722, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41420844197273254, "step": 1099 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.816709402826297e-09, "advantages/std": 0.49596306681632996, "advantages/var": 0.24597936364585937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.700106723585913, "grad_norm": 0.2888164459361838, "learning_rate": 1.0287179887530139e-08, "loss": 0.0, "num_tokens": 171383928.0, "reward": 0.7109375, "reward_std": 0.12388662248849869, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 1100 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814751550759118e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.704375667022412, "grad_norm": 0.34120205918851826, "learning_rate": 1.0019147481706625e-08, "loss": 0.0, "num_tokens": 171541379.0, "reward": 0.703125, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 1101 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.983395592005843e-09, "advantages/std": 0.46760234236717224, "advantages/var": 0.21865195058726616, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 4.7086446104589115, "grad_norm": 0.26555135370742966, "learning_rate": 9.754617775238561e-09, "loss": 0.0, "num_tokens": 171693104.0, "reward": 0.72265625, "reward_std": 0.114015132188797, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 1102 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.975343977137235e-09, "advantages/std": 0.46758273243904114, "advantages/var": 0.21863361167515993, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.712913553895411, "grad_norm": 0.252917843314809, "learning_rate": 9.493592659155002e-09, "loss": 0.0, "num_tokens": 171854339.0, "reward": 0.8125, "reward_std": 0.09522314369678497, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.3910769522190094, "step": 1103 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.694503823521619e-10, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.71718249733191, "grad_norm": 0.27037004134189363, "learning_rate": 9.236073999431937e-09, "loss": 0.0, "num_tokens": 172009884.0, "reward": 0.81640625, "reward_std": 0.1255941092967987, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 1104 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.944934388860789e-09, "advantages/std": 0.5483036041259766, "advantages/var": 0.3006368422975356, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.72145144076841, "grad_norm": 0.2863127137607284, "learning_rate": 8.98206363697901e-09, "loss": 0.0, "num_tokens": 172153790.0, "reward": 0.7265625, "reward_std": 0.14716076850891113, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 1105 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.481366596560484e-09, "advantages/std": 0.46759748458862305, "advantages/var": 0.21864740759360757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.72572038420491, "grad_norm": 0.20646929432145555, "learning_rate": 8.731563387626096e-09, "loss": -0.0, "num_tokens": 172319948.0, "reward": 0.6328125, "reward_std": 0.10942068696022034, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 1106 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.789693917499295e-09, "advantages/std": 0.5227907299995422, "advantages/var": 0.27331014737345427, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 4.729989327641409, "grad_norm": 0.3268089437339258, "learning_rate": 8.484575042110698e-09, "loss": 0.0, "num_tokens": 172479337.0, "reward": 0.76171875, "reward_std": 0.13835011422634125, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 1107 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1231638491478496e-09, "advantages/std": 0.5483105778694153, "advantages/var": 0.3006444898034921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.734258271077908, "grad_norm": 0.24190386381838827, "learning_rate": 8.241100366064902e-09, "loss": 0.0, "num_tokens": 172634544.0, "reward": 0.81640625, "reward_std": 0.1545259803533554, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 1108 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.821694928466129e-09, "advantages/std": 0.5483105778694153, "advantages/var": 0.3006444898034921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.7385272145144075, "grad_norm": 0.26512176008869315, "learning_rate": 8.001141100002884e-09, "loss": -0.0, "num_tokens": 172786862.0, "reward": 0.81640625, "reward_std": 0.1545259654521942, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 1109 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016666532322234e-09, "advantages/std": 0.5227798223495483, "advantages/var": 0.2732987426558253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.742796157950907, "grad_norm": 0.30151062432531645, "learning_rate": 7.764698959308313e-09, "loss": 0.0, "num_tokens": 172940709.0, "reward": 0.81640625, "reward_std": 0.1258624941110611, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.387910932302475, "step": 1110 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.8894634138202136e-09, "advantages/std": 0.3696773946285248, "advantages/var": 0.13666137609933404, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.747065101387407, "grad_norm": 0.15944846996665116, "learning_rate": 7.531775634222137e-09, "loss": -0.0, "num_tokens": 173076717.0, "reward": 0.8359375, "reward_std": 0.07627260684967041, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.3710577189922333, "step": 1111 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.661563943316494e-09, "advantages/std": 0.4373944103717804, "advantages/var": 0.19131387022447743, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.751334044823906, "grad_norm": 0.2350230283206485, "learning_rate": 7.302372789830702e-09, "loss": 0.0, "num_tokens": 173224124.0, "reward": 0.7265625, "reward_std": 0.09271685779094696, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 1112 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5483001470565796, "advantages/var": 0.3006330512622668, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.755602988260406, "grad_norm": 0.2799137495336964, "learning_rate": 7.076492066053486e-09, "loss": 0.0, "num_tokens": 173368075.0, "reward": 0.76171875, "reward_std": 0.14203834533691406, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 1113 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.4854364888597375e-09, "advantages/std": 0.4676069915294647, "advantages/var": 0.2186562985272369, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.759871931696905, "grad_norm": 0.23008597518051577, "learning_rate": 6.854135077631773e-09, "loss": 0.0, "num_tokens": 173517174.0, "reward": 0.828125, "reward_std": 0.11849337071180344, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 1114 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 9.765177591033836e-09, "advantages/std": 0.5960737466812134, "advantages/var": 0.35530391148257934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.764140875133404, "grad_norm": 0.2973781606816029, "learning_rate": 6.6353034141168325e-09, "loss": 0.0, "num_tokens": 173677131.0, "reward": 0.6953125, "reward_std": 0.18056842684745789, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4611765742301941, "step": 1115 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 1.0113837549730904e-08, "advantages/std": 0.4373989701271057, "advantages/var": 0.19131785906825272, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.7684098185699035, "grad_norm": 0.20405016184291036, "learning_rate": 6.419998639858537e-09, "loss": -0.0, "num_tokens": 173839238.0, "reward": 0.671875, "reward_std": 0.0966646745800972, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.47045037150382996, "step": 1116 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.36966416239738464, "advantages/var": 0.13665159296095997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.772678762006404, "grad_norm": 0.19423563422651047, "learning_rate": 6.208222293994425e-09, "loss": 0.0, "num_tokens": 173969936.0, "reward": 0.890625, "reward_std": 0.06602286547422409, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31272050738334656, "step": 1117 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.3360912852746437e-09, "advantages/std": 0.5227875709533691, "advantages/var": 0.273306844343324, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.776947705442903, "grad_norm": 0.2424510021874874, "learning_rate": 5.999975890438435e-09, "loss": -0.0, "num_tokens": 174132488.0, "reward": 0.67578125, "reward_std": 0.13546313345432281, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.46899911761283875, "step": 1118 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.336079555818232e-09, "advantages/std": 0.5227921605110168, "advantages/var": 0.2733116430917768, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.781216648879402, "grad_norm": 0.36431189368295286, "learning_rate": 5.795260917870359e-09, "loss": 0.0, "num_tokens": 174298944.0, "reward": 0.66796875, "reward_std": 0.13888297975063324, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.4718646705150604, "step": 1119 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5969250913959587e-09, "advantages/std": 0.4373980462551117, "advantages/var": 0.19131705086778883, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 4.785485592315902, "grad_norm": 0.2691325592767996, "learning_rate": 5.594078839724792e-09, "loss": -0.0, "num_tokens": 174460048.0, "reward": 0.71875, "reward_std": 0.09719263762235641, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1120 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 6.1027632923521535e-09, "advantages/std": 0.49597176909446716, "advantages/var": 0.24598799573869545, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.789754535752401, "grad_norm": 0.27735708577103635, "learning_rate": 5.396431094181197e-09, "loss": -0.0, "num_tokens": 174608479.0, "reward": 0.73046875, "reward_std": 0.13413390517234802, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 1121 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.235163042765691e-09, "advantages/std": 0.5227816700935364, "advantages/var": 0.2733006745857871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.794023479188901, "grad_norm": 0.27998724553881144, "learning_rate": 5.202319094153252e-09, "loss": 0.0, "num_tokens": 174767612.0, "reward": 0.7109375, "reward_std": 0.12863078713417053, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45421501994132996, "step": 1122 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.9749955192391796e-09, "advantages/std": 0.4676100015640259, "advantages/var": 0.21865911356270828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.7982924226254005, "grad_norm": 0.2236879992519063, "learning_rate": 5.011744227278625e-09, "loss": 0.0, "num_tokens": 174924408.0, "reward": 0.69140625, "reward_std": 0.12243872880935669, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.46281787753105164, "step": 1123 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.698561914841724e-09, "advantages/std": 0.5483006238937378, "advantages/var": 0.3006335741622621, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.8025613660619, "grad_norm": 0.28551119505496925, "learning_rate": 4.824707855909605e-09, "loss": -0.0, "num_tokens": 175075666.0, "reward": 0.71875, "reward_std": 0.1442737877368927, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1124 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.4536934733273464e-10, "advantages/std": 0.5227810144424438, "advantages/var": 0.2732999890614707, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.806830309498399, "grad_norm": 0.2896090207354788, "learning_rate": 4.641211317102822e-09, "loss": -0.0, "num_tokens": 175218993.0, "reward": 0.7734375, "reward_std": 0.12756997346878052, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 1125 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.9752719972572655e-09, "advantages/std": 0.4675883650779724, "advantages/var": 0.2186388791562912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.811099252934898, "grad_norm": 0.2796216225903626, "learning_rate": 4.461255922609985e-09, "loss": 0.0, "num_tokens": 175350083.0, "reward": 0.87109375, "reward_std": 0.10087842494249344, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33575257658958435, "step": 1126 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.4896779409655654e-09, "advantages/std": 0.46759188175201416, "advantages/var": 0.2186421678803896, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.815368196371399, "grad_norm": 0.23411543424994902, "learning_rate": 4.284842958868329e-09, "loss": -0.0, "num_tokens": 175494985.0, "reward": 0.76953125, "reward_std": 0.10376540571451187, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1127 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.477180109708889e-09, "advantages/std": 0.46760138869285583, "advantages/var": 0.21865105870748724, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.819637139807898, "grad_norm": 0.28312151408955855, "learning_rate": 4.111973686991676e-09, "loss": 0.0, "num_tokens": 175640384.0, "reward": 0.76171875, "reward_std": 0.11283808946609497, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.4268665909767151, "step": 1128 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 7.1258372493495695e-09, "advantages/std": 0.5227863192558289, "advantages/var": 0.2733055356010574, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.823906083244397, "grad_norm": 0.2462476751541139, "learning_rate": 3.9426493427611175e-09, "loss": 0.0, "num_tokens": 175784391.0, "reward": 0.71875, "reward_std": 0.1337556689977646, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1129 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23379866778850555, "advantages/var": 0.054661817059679985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.8281750266808965, "grad_norm": 0.17545168464173302, "learning_rate": 3.776871136616289e-09, "loss": -0.0, "num_tokens": 175921949.0, "reward": 0.79296875, "reward_std": 0.02775236964225769, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 1130 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.812195974509691e-10, "advantages/std": 0.5960696339607239, "advantages/var": 0.35529900853007135, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.832443970117396, "grad_norm": 0.322489514678151, "learning_rate": 3.614640253646828e-09, "loss": -0.0, "num_tokens": 176092087.0, "reward": 0.6796875, "reward_std": 0.17544355988502502, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 1131 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 6.5454632784124295e-09, "advantages/std": 0.6402834057807922, "advantages/var": 0.40996283971825065, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.836712913553895, "grad_norm": 0.3875750637098193, "learning_rate": 3.4559578535837685e-09, "loss": 0.0, "num_tokens": 176254779.0, "reward": 0.6328125, "reward_std": 0.20437544584274292, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 1132 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.694570397962185e-10, "advantages/std": 0.49595728516578674, "advantages/var": 0.2459736287090175, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.840981856990394, "grad_norm": 0.3219642366491413, "learning_rate": 3.3008250707913242e-09, "loss": 0.0, "num_tokens": 176420160.0, "reward": 0.6875, "reward_std": 0.11928972601890564, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4644203782081604, "step": 1133 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.8747867624157402e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.845250800426895, "grad_norm": 0.22062801731458345, "learning_rate": 3.14924301425884e-09, "loss": -0.0, "num_tokens": 176576177.0, "reward": 0.703125, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 1134 }, { "advantages/mean": 4.423782229423523e-09, "advantages/snr": 9.460670050669734e-09, "advantages/std": 0.4675971269607544, "advantages/var": 0.21864707314195186, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.849519743863394, "grad_norm": 0.21905543535634842, "learning_rate": 3.00121276759252e-09, "loss": -0.0, "num_tokens": 176716311.0, "reward": 0.79296875, "reward_std": 0.10889027267694473, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40597182512283325, "step": 1135 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.991768376879134e-09, "advantages/std": 0.4675857722759247, "advantages/var": 0.2186364544348729, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.853788687299893, "grad_norm": 0.2494609999273987, "learning_rate": 2.856735389008269e-09, "loss": -0.0, "num_tokens": 176874861.0, "reward": 0.65234375, "reward_std": 0.09916849434375763, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.4771590530872345, "step": 1136 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.149914704966296e-09, "advantages/std": 0.40495288372039795, "advantages/var": 0.16398683803346614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.8580576307363925, "grad_norm": 0.23420297830571496, "learning_rate": 2.7158119113234732e-09, "loss": -0.0, "num_tokens": 177010122.0, "reward": 0.7734375, "reward_std": 0.08337578922510147, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 1137 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 4.024806323518198e-09, "advantages/std": 0.4049423336982727, "advantages/var": 0.16397829362100325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 4.862326574172892, "grad_norm": 0.30097179874734115, "learning_rate": 2.578443341950176e-09, "loss": 0.0, "num_tokens": 177165560.0, "reward": 0.70703125, "reward_std": 0.07483352720737457, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 1138 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.504941560504481e-09, "advantages/std": 0.5726861953735352, "advantages/var": 0.3279694783714149, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.866595517609392, "grad_norm": 0.30906655932357985, "learning_rate": 2.4446306628875813e-09, "loss": 0.0, "num_tokens": 177325268.0, "reward": 0.65625, "reward_std": 0.1626875400543213, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.47588926553726196, "step": 1139 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.8167369917440553e-09, "advantages/std": 0.49595820903778076, "advantages/var": 0.24597454511196304, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 4.870864461045891, "grad_norm": 0.25293915422456575, "learning_rate": 2.31437483071506e-09, "loss": -0.0, "num_tokens": 177501239.0, "reward": 0.5546875, "reward_std": 0.12046678364276886, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49797385931015015, "step": 1140 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.0956275813194834e-09, "advantages/std": 0.5483068823814392, "advantages/var": 0.3006404372668534, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.875133404482391, "grad_norm": 0.27334241170458684, "learning_rate": 2.1876767765853233e-09, "loss": 0.0, "num_tokens": 177657926.0, "reward": 0.70703125, "reward_std": 0.15057817101478577, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 1141 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2739222670442577e-09, "advantages/std": 0.5483002662658691, "advantages/var": 0.300633181987223, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.87940234791889, "grad_norm": 0.2710624584587459, "learning_rate": 2.0645374062179253e-09, "loss": 0.0, "num_tokens": 177795708.0, "reward": 0.80078125, "reward_std": 0.1437433660030365, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.40019527077674866, "step": 1142 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.5628544429995396e-09, "advantages/std": 0.5227957367897034, "advantages/var": 0.2733153824054888, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.883671291355389, "grad_norm": 0.2565490653909916, "learning_rate": 1.9449575998924383e-09, "loss": 0.0, "num_tokens": 177938848.0, "reward": 0.828125, "reward_std": 0.14400538802146912, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 1143 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.97924068182451e-10, "advantages/std": 0.4676026999950409, "advantages/var": 0.21865228504265222, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 4.887940234791889, "grad_norm": 0.22628308527208368, "learning_rate": 1.8289382124426211e-09, "loss": 0.0, "num_tokens": 178102728.0, "reward": 0.6328125, "reward_std": 0.1145455539226532, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48298248648643494, "step": 1144 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1232125491970075e-09, "advantages/std": 0.5482980012893677, "advantages/var": 0.30063069821791544, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.892209178228389, "grad_norm": 0.2379483920272022, "learning_rate": 1.7164800732498154e-09, "loss": -0.0, "num_tokens": 178254574.0, "reward": 0.76953125, "reward_std": 0.14032843708992004, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1145 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.878675704298026e-09, "advantages/std": 0.5726897716522217, "advantages/var": 0.3279735745550738, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 4.896478121664888, "grad_norm": 0.35643188938727866, "learning_rate": 1.6075839862374486e-09, "loss": 0.0, "num_tokens": 178401659.0, "reward": 0.7578125, "reward_std": 0.16834037005901337, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.4292463958263397, "step": 1146 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.97100182115522e-09, "advantages/std": 0.4675983488559723, "advantages/var": 0.21864821585283156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.900747065101387, "grad_norm": 0.24224725482315074, "learning_rate": 1.5022507298649845e-09, "loss": 0.0, "num_tokens": 178553534.0, "reward": 0.71875, "reward_std": 0.10889272391796112, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1147 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 6.337630938135611e-09, "advantages/std": 0.3306402266025543, "advantages/var": 0.10932295944778847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.905016008537887, "grad_norm": 0.1679248225157117, "learning_rate": 1.4004810571225378e-09, "loss": 0.0, "num_tokens": 178707051.0, "reward": 0.7265625, "reward_std": 0.05444391071796417, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 1148 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4083695114256918e-09, "advantages/std": 0.4959578514099121, "advantages/var": 0.24597419037513646, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.909284951974386, "grad_norm": 0.21289099107193096, "learning_rate": 1.30227569552549e-09, "loss": -0.0, "num_tokens": 178858493.0, "reward": 0.73046875, "reward_std": 0.11993636190891266, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44458550214767456, "step": 1149 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.779016395729458e-09, "advantages/std": 0.3696686327457428, "advantages/var": 0.13665489803610686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.913553895410886, "grad_norm": 0.21853704015214817, "learning_rate": 1.2076353471089927e-09, "loss": 0.0, "num_tokens": 178990574.0, "reward": 0.921875, "reward_std": 0.06890985369682312, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.26889389753341675, "step": 1150 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 1.0114006384407448e-08, "advantages/std": 0.4373916685581207, "advantages/var": 0.19131147172405694, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.9178228388473855, "grad_norm": 0.23268145856156358, "learning_rate": 1.116560688423418e-09, "loss": 0.0, "num_tokens": 179147929.0, "reward": 0.6796875, "reward_std": 0.09100693464279175, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 1151 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.970936510266691e-09, "advantages/std": 0.4676027297973633, "advantages/var": 0.21865231291394593, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.922091782283885, "grad_norm": 0.2005730784728622, "learning_rate": 1.0290523705291932e-09, "loss": 0.0, "num_tokens": 179308414.0, "reward": 0.703125, "reward_std": 0.1145455539226532, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45777595043182373, "step": 1152 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.694553754175022e-09, "advantages/std": 0.4959590435028076, "advantages/var": 0.24597537283221982, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.926360725720384, "grad_norm": 0.29268173253105517, "learning_rate": 9.45111018992306e-10, "loss": 0.0, "num_tokens": 179442130.0, "reward": 0.828125, "reward_std": 0.11993882060050964, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3780108094215393, "step": 1153 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7814881544564379e-09, "advantages/std": 0.5227778553962708, "advantages/var": 0.2732966860927242, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.930629669156883, "grad_norm": 0.2273045762887075, "learning_rate": 8.647372338795866e-10, "loss": -0.0, "num_tokens": 179593784.0, "reward": 0.71875, "reward_std": 0.12468298524618149, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1154 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1175086559963265e-09, "advantages/std": 0.5227938890457153, "advantages/var": 0.2733134504235437, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.934898612593383, "grad_norm": 0.2420054917145749, "learning_rate": 7.8793158975482e-10, "loss": -0.0, "num_tokens": 179753798.0, "reward": 0.70703125, "reward_std": 0.1412370800971985, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.45601576566696167, "step": 1155 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 6.9283207245813425e-09, "advantages/std": 0.36966201663017273, "advantages/var": 0.1366500065390861, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.939167556029883, "grad_norm": 0.17994478010217374, "learning_rate": 7.146946356743067e-10, "loss": 0.0, "num_tokens": 179901476.0, "reward": 0.6640625, "reward_std": 0.06378498673439026, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 1156 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.0655376988318266e-10, "advantages/std": 0.5726933479309082, "advantages/var": 0.3279776707643123, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.943436499466382, "grad_norm": 0.3042820187636512, "learning_rate": 6.450268951830318e-10, "loss": 0.0, "num_tokens": 180062992.0, "reward": 0.6640625, "reward_std": 0.17069938778877258, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4732423722743988, "step": 1157 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.2861526764651336e-09, "advantages/std": 0.49596431851387024, "advantages/var": 0.24598060523892773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 4.9477054429028815, "grad_norm": 0.23530944447092877, "learning_rate": 5.789288663110014e-10, "loss": 0.0, "num_tokens": 180224794.0, "reward": 0.72265625, "reward_std": 0.1255940943956375, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4485645890235901, "step": 1158 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 8.016719545759423e-09, "advantages/std": 0.5227763652801514, "advantages/var": 0.27329512809552625, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.951974386339381, "grad_norm": 0.298315077604583, "learning_rate": 5.164010215695791e-10, "loss": 0.0, "num_tokens": 180364639.0, "reward": 0.7734375, "reward_std": 0.12244509160518646, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 1159 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.755670987579601e-09, "advantages/std": 0.49595534801483154, "advantages/var": 0.24597170722451267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.95624332977588, "grad_norm": 0.2191656669611469, "learning_rate": 4.574438079480991e-10, "loss": 0.0, "num_tokens": 180506762.0, "reward": 0.82421875, "reward_std": 0.11652141809463501, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3813795745372772, "step": 1160 }, { "advantages/mean": -5.587935447692871e-09, "advantages/snr": 1.1950243137431801e-08, "advantages/std": 0.46760013699531555, "advantages/var": 0.21864988811803787, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.96051227321238, "grad_norm": 0.18746099127830204, "learning_rate": 4.020576469108139e-10, "loss": 0.0, "num_tokens": 180642341.0, "reward": 0.8671875, "reward_std": 0.11283563077449799, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.3400367796421051, "step": 1161 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.691755147905178e-09, "advantages/std": 0.5726931095123291, "advantages/var": 0.3279773976829006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.96478121664888, "grad_norm": 0.2954386105161857, "learning_rate": 3.5024293439372967e-10, "loss": -0.0, "num_tokens": 180813205.0, "reward": 0.66015625, "reward_std": 0.17187398672103882, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.47458380460739136, "step": 1162 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.694550651109113e-09, "advantages/std": 0.4959593713283539, "advantages/var": 0.245975698008416, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.969050160085379, "grad_norm": 0.24627764577496902, "learning_rate": 3.020000408018863e-10, "loss": 0.0, "num_tokens": 180953416.0, "reward": 0.80859375, "reward_std": 0.12046922743320465, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.39417871832847595, "step": 1163 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.855392400745098e-09, "advantages/std": 0.4373980164527893, "advantages/var": 0.19131702479683454, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.973319103521878, "grad_norm": 0.19705677177548778, "learning_rate": 2.573293110065822e-10, "loss": 0.0, "num_tokens": 181091777.0, "reward": 0.7265625, "reward_std": 0.09719263762235641, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.446596622467041, "step": 1164 }, { "advantages/mean": -3.958120942115784e-09, "advantages/snr": 7.571227974076312e-09, "advantages/std": 0.5227845311164856, "advantages/var": 0.2733036659746837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.9775880469583775, "grad_norm": 0.31218340104221065, "learning_rate": 2.1623106434309757e-10, "loss": 0.0, "num_tokens": 181237625.0, "reward": 0.76953125, "reward_std": 0.13098736107349396, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.4219578504562378, "step": 1165 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.224992848592354e-09, "advantages/std": 0.33064761757850647, "advantages/var": 0.10932784701034226, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 4.981856990394878, "grad_norm": 0.15329749593412742, "learning_rate": 1.787055946081417e-10, "loss": -0.0, "num_tokens": 181372123.0, "reward": 0.7734375, "reward_std": 0.059568777680397034, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.41942715644836426, "step": 1166 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.972431973776251e-09, "advantages/std": 0.5483101010322571, "advantages/var": 0.30064396689400397, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.986125933831377, "grad_norm": 0.22789400713755284, "learning_rate": 1.4475317005802067e-10, "loss": -0.0, "num_tokens": 181532071.0, "reward": 0.6796875, "reward_std": 0.15558436512947083, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4675106406211853, "step": 1167 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.505066136085465e-09, "advantages/std": 0.5726752281188965, "advantages/var": 0.3279569169010301, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.990394877267876, "grad_norm": 0.2972979244184956, "learning_rate": 1.1437403340652796e-10, "loss": 0.0, "num_tokens": 181682041.0, "reward": 0.69921875, "reward_std": 0.15019746124744415, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45949608087539673, "step": 1168 }, { "advantages/mean": -4.190951585769653e-09, "advantages/snr": 1.1337001660336367e-08, "advantages/std": 0.36967018246650696, "advantages/var": 0.13665604380482055, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.994663820704376, "grad_norm": 0.2458655812447713, "learning_rate": 8.756840182344571e-11, "loss": 0.0, "num_tokens": 181810281.0, "reward": 0.87109375, "reward_std": 0.06891229748725891, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33575257658958435, "step": 1169 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.528047335495492e-10, "advantages/std": 0.618568480014801, "advantages/var": 0.3826269644678213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.998932764140875, "grad_norm": 0.32316307035032915, "learning_rate": 6.433646693265737e-11, "loss": 0.0, "num_tokens": 181970221.0, "reward": 0.71875, "reward_std": 0.18596169352531433, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45048993825912476, "step": 1170 } ], "logging_steps": 1, "max_steps": 1175, "num_input_tokens_seen": 181970221, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }